library(dplyr)
library(tidyverse)
library(DescTools)
library(vcdExtra)
library(gmodels)
library(ggplot2)
library(jmv)
library(mgcv)

# Load the practice dataset from its CSV export.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists. Consider a project-relative path.
ovr50 <- read.csv(
  file = "/Users/jspade/Desktop/MSA Info/Networking/RTI assignment/main_practice01csv.csv"
)

# Rename the misspelled column "martial_status" to "marital_status".
# If no column has the misspelled name, the names are left unchanged.
names(ovr50) <- replace(
  names(ovr50),
  names(ovr50) == "martial_status",
  "marital_status"
)

#view csv
# View(ovr50)

# Exploring the data part 1 ----

#Looking at summary statistics

# Tabulate the values of each column (most frequent first) to see what
# needs to be cleaned.
count(ovr50, workclass, sort = TRUE) # includes "?" placeholder entries
##          workclass     n
## 1          Private 33906
## 2 Self-emp-not-inc  3862
## 3        Local-gov  3136
## 4                ?  2799
## 5        State-gov  1981
## 6     Self-emp-inc  1695
## 7      Federal-gov  1432
## 8      Without-pay    21
## 9     Never-worked    10
# Frequency of each education_level value, most common first.
count(ovr50, education_level, sort = TRUE)
##    education_level     n
## 1          HS-grad 15784
## 2     Some-college 10878
## 3        Bachelors  8025
## 4          Masters  2657
## 5        Assoc-voc  2061
## 6             11th  1812
## 7       Assoc-acdm  1601
## 8             10th  1389
## 9          7th-8th   955
## 10     Prof-school   834
## 11             9th   756
## 12            12th   657
## 13       Doctorate   594
## 14         5th-6th   509
## 15         1st-4th   247
## 16       Preschool    83
# Frequency of each education_num value, most common first.
# NOTE(review): the counts mirror education_level's (e.g. 15784 for both
# 9 and HS-grad), so this is presumably a numeric encoding of it; confirm.
count(ovr50, education_num, sort = TRUE)
##    education_num     n
## 1              9 15784
## 2             10 10878
## 3             13  8025
## 4             14  2657
## 5             11  2061
## 6              7  1812
## 7             12  1601
## 8              6  1389
## 9              4   955
## 10            15   834
## 11             5   756
## 12             8   657
## 13            16   594
## 14             3   509
## 15             2   247
## 16             1    83
# Frequency of each marital_status value, most common first.
count(ovr50, marital_status, sort = TRUE)
##          marital_status     n
## 1    Married-civ-spouse 22379
## 2         Never-married 16117
## 3              Divorced  6633
## 4             Separated  1530
## 5               Widowed  1518
## 6 Married-spouse-absent   628
## 7     Married-AF-spouse    37
# Frequency of each occupation value, most common first.
count(ovr50, occupation, sort = TRUE) # includes "?" placeholder entries
##           occupation    n
## 1     Prof-specialty 6172
## 2       Craft-repair 6112
## 3    Exec-managerial 6086
## 4       Adm-clerical 5611
## 5              Sales 5504
## 6      Other-service 4923
## 7  Machine-op-inspct 3022
## 8                  ? 2809
## 9   Transport-moving 2355
## 10 Handlers-cleaners 2072
## 11   Farming-fishing 1490
## 12      Tech-support 1446
## 13   Protective-serv  983
## 14   Priv-house-serv  242
## 15      Armed-Forces   15
# Frequency of each relationship value, most common first.
count(ovr50, relationship, sort = TRUE)
##     relationship     n
## 1        Husband 19716
## 2  Not-in-family 12583
## 3      Own-child  7581
## 4      Unmarried  5125
## 5           Wife  2331
## 6 Other-relative  1506
# Frequency of each race value, most common first.
count(ovr50, race, sort = TRUE)
##                 race     n
## 1              White 41762
## 2              Black  4685
## 3 Asian-Pac-Islander  1519
## 4 Amer-Indian-Eskimo   470
## 5              Other   406
# Frequency of each sex value, most common first.
count(ovr50, sex, sort = TRUE)
##      sex     n
## 1   Male 32650
## 2 Female 16192
# Distinct capital_gain values by frequency. The distribution is
# right-skewed (not left-skewed): nearly every row is 0, with a long tail of
# large gains pulling the mean above the median.
count(ovr50, capital_gain, sort = TRUE)
##     capital_gain     n
## 1              0 44807
## 2          15024   513
## 3           7688   410
## 4           7298   364
## 5          99999   244
## 6           3103   152
## 7           5178   146
## 8           5013   117
## 9           4386   108
## 10          8614    82
## 11          3325    81
## 12          2174    74
## 13         10520    64
## 14          4650    63
## 15         27828    58
## 16          4064    54
## 17           594    52
## 18          3137    51
## 19         14084    49
## 20         20051    49
## 21          2829    42
## 22          3908    42
## 23          6849    42
## 24         13550    42
## 25          1055    37
## 26          4787    35
## 27          3411    34
## 28         14344    34
## 29          3464    33
## 30          2176    31
## 31          2597    31
## 32          9386    31
## 33          2885    30
## 34          4101    29
## 35          2202    28
## 36          2407    25
## 37          4865    25
## 38          1506    24
## 39          4416    24
## 40          4508    23
## 41          3674    22
## 42          2354    21
## 43          2580    20
## 44         10605    19
## 45          2907    18
## 46          3942    18
## 47          5455    18
## 48          3781    16
## 49          6418    16
## 50          2105    15
## 51          2463    15
## 52          6497    15
## 53          7430    15
## 54          2635    14
## 55          2964    14
## 56         25236    14
## 57          1151    13
## 58          2653    11
## 59          2977    11
## 60          3471    11
## 61          3818    11
## 62           914    10
## 63          1409    10
## 64          1797    10
## 65          2290    10
## 66          2414    10
## 67          4934    10
## 68          6514    10
## 69         15020    10
## 70          1471     9
## 71          1831     9
## 72          1848     9
## 73           114     8
## 74          1086     8
## 75          2346     8
## 76          3418     8
## 77          3887     8
## 78         10566     8
## 79         15831     8
## 80          2329     7
## 81          3273     7
## 82          5721     7
## 83          7443     7
## 84           991     6
## 85          3456     6
## 86          5556     6
## 87          6767     6
## 88         25124     6
## 89         34095     6
## 90           401     5
## 91          1173     5
## 92          2036     5
## 93          2050     5
## 94          2228     5
## 95          2538     5
## 96          6723     5
## 97          9562     5
## 98          1424     4
## 99          1455     4
## 100         2936     4
## 101         2961     4
## 102         3432     4
## 103         4687     4
## 104         4931     4
## 105         7896     4
## 106        11678     4
## 107         2009     3
## 108         2062     3
## 109         2993     3
## 110         6360     3
## 111        41310     3
## 112         1264     2
## 113         5060     2
## 114         6097     2
## 115         7978     2
## 116        18481     2
## 117         1111     1
## 118         1639     1
## 119         1731     1
## 120         2387     1
## 121         6612     1
## 122         7262     1
## 123        22040     1
# Distinct capital_loss values by frequency. The distribution is
# right-skewed (not left-skewed): nearly every row is 0, with a long tail of
# larger losses pulling the mean above the median.
count(ovr50, capital_loss, sort = TRUE)
##    capital_loss     n
## 1             0 46560
## 2          1902   304
## 3          1977   253
## 4          1887   233
## 5          2415    72
## 6          1485    71
## 7          1848    67
## 8          1590    62
## 9          1602    62
## 10         1876    59
## 11         1740    58
## 12         1672    50
## 13         1741    44
## 14         1564    43
## 15         2258    39
## 16         1719    38
## 17         1980    36
## 18         1408    35
## 19         1669    35
## 20         2001    35
## 21         2002    33
## 22         1579    30
## 23         2051    29
## 24         1721    28
## 25         1974    28
## 26         2339    27
## 27         1504    26
## 28         2377    25
## 29         1628    24
## 30         1762    20
## 31         2179    20
## 32         2444    20
## 33         2205    19
## 34          625    17
## 35         2559    17
## 36         2057    16
## 37         2824    14
## 38         1573    12
## 39         2042    12
## 40         1092    11
## 41         1340    11
## 42         1617    11
## 43         1651    11
## 44         2392    11
## 45         1380    10
## 46         2174    10
## 47         1594     9
## 48         1668     9
## 49         1726     9
## 50         2246     8
## 51         2129     7
## 52         2231     7
## 53         2603     7
## 54          880     6
## 55         1258     6
## 56         2206     6
## 57          213     5
## 58          323     5
## 59         1825     5
## 60         2149     5
## 61         2547     5
## 62         3004     5
## 63          653     4
## 64         1138     4
## 65         1411     4
## 66         1816     4
## 67         2238     4
## 68         2457     4
## 69         2472     4
## 70         3770     4
## 71          419     3
## 72         1429     3
## 73         1510     3
## 74         1648     3
## 75         1735     3
## 76         1844     3
## 77         1944     3
## 78         2267     3
## 79         4356     3
## 80          810     2
## 81          974     2
## 82         1755     2
## 83         2163     2
## 84         2282     2
## 85         2352     2
## 86         2467     2
## 87         2754     2
## 88         3175     2
## 89         3683     2
## 90         3900     2
## 91          155     1
## 92         1421     1
## 93         1539     1
## 94         1870     1
## 95         1911     1
## 96         2080     1
## 97         2201     1
## 98         2465     1
## 99         2489     1
# Frequency of each hours_week value, most common first.
count(ovr50, hours_week, sort = TRUE)
##    hours_week     n
## 1          40 22803
## 2          50  4246
## 3          45  2717
## 4          60  2177
## 5          35  1937
## 6          20  1862
## 7          30  1700
## 8          55  1051
## 9          25   958
## 10         48   770
## 11         38   714
## 12         15   623
## 13         70   437
## 14         10   425
## 15         32   423
## 16         65   355
## 17         24   354
## 18         42   338
## 19         36   336
## 20         44   310
## 21         16   303
## 22         12   247
## 23         37   242
## 24         43   227
## 25          8   218
## 26         80   210
## 27         52   205
## 28         56   141
## 29         28   140
## 30         99   137
## 31         18   129
## 32         46   129
## 33         72   107
## 34         75   105
## 35          5    95
## 36          6    92
## 37          4    84
## 38         47    82
## 39         84    72
## 40         39    63
## 41         22    62
## 42         54    62
## 43         33    61
## 44          3    59
## 45         41    59
## 46         14    55
## 47          2    53
## 48         34    48
## 49         21    46
## 50          7    45
## 51         27    43
## 52         17    42
## 53         90    42
## 54         23    40
## 55         26    40
## 56         49    39
## 57         53    39
## 58         58    38
## 59         13    28
## 60          1    27
## 61          9    27
## 62         62    23
## 63         66    23
## 64         64    22
## 65         11    20
## 66         51    20
## 67         19    19
## 68         57    19
## 69         85    17
## 70         68    16
## 71         29    15
## 72         63    15
## 73         98    14
## 74         78    13
## 75         31    12
## 76         77     9
## 77         96     9
## 78         59     7
## 79         67     6
## 80         61     4
## 81         73     4
## 82         76     4
## 83         86     4
## 84         88     4
## 85         74     3
## 86         81     3
## 87         89     3
## 88         91     3
## 89         92     3
## 90         95     2
## 91         97     2
## 92         69     1
## 93         79     1
## 94         82     1
## 95         87     1
## 96         94     1
# Frequency of each country value, most common first.
count(ovr50, country, sort = TRUE) # includes "?" placeholder entries
##                       country     n
## 1               United-States 43832
## 2                      Mexico   951
## 3                           ?   857
## 4                 Philippines   295
## 5                     Germany   206
## 6                 Puerto-Rico   184
## 7                      Canada   182
## 8                 El-Salvador   155
## 9                       India   151
## 10                       Cuba   138
## 11                    England   127
## 12                      China   122
## 13                      South   115
## 14                    Jamaica   106
## 15                      Italy   105
## 16         Dominican-Republic   103
## 17                      Japan    92
## 18                  Guatemala    88
## 19                     Poland    87
## 20                    Vietnam    86
## 21                   Columbia    85
## 22                      Haiti    75
## 23                   Portugal    67
## 24                     Taiwan    65
## 25                       Iran    59
## 26                     Greece    49
## 27                  Nicaragua    49
## 28                       Peru    46
## 29                    Ecuador    45
## 30                     France    38
## 31                    Ireland    37
## 32                       Hong    30
## 33                   Thailand    30
## 34                   Cambodia    28
## 35            Trinadad&Tobago    27
## 36                       Laos    23
## 37 Outlying-US(Guam-USVI-etc)    23
## 38                 Yugoslavia    23
## 39                   Scotland    21
## 40                   Honduras    20
## 41                    Hungary    19
## 42         Holand-Netherlands     1
# Class balance of over_50k, the target variable.
count(ovr50, over_50k, sort = TRUE)
##   over_50k     n
## 1        0 37155
## 2        1 11687
# Min, quartiles, median, mean and max for each numeric column
# (the five-number summary plus the mean).
summary(ovr50[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.00   28.00   37.00   38.64   48.00   90.00
summary(ovr50[["education_num"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    9.00   10.00   10.08   12.00   16.00
summary(ovr50[["capital_gain"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0    1079       0   99999
summary(ovr50[["capital_loss"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0    87.5     0.0  4356.0
summary(ovr50[["hours_week"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   40.00   40.00   40.42   45.00   99.00

# Exploring the data part 2 ----

#Looking at statistical tests for significance

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$workclass 
## ovr50$over_50k |                ? |      Federal-gov |        Local-gov |     Never-worked |          Private |     Self-emp-inc | Self-emp-not-inc |        State-gov |      Without-pay |        Row Total | 
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##              0 |             2534 |              871 |             2209 |               10 |            26519 |              757 |             2785 |             1451 |               19 |            37155 | 
##                |         2129.250 |         1089.349 |         2385.612 |            7.607 |        25792.912 |         1289.417 |         2937.894 |         1506.983 |           15.975 |                  | 
##                |           76.939 |           43.766 |           13.075 |            0.753 |           20.440 |          219.842 |            7.957 |            2.080 |            0.573 |                  | 
##                |            0.068 |            0.023 |            0.059 |            0.000 |            0.714 |            0.020 |            0.075 |            0.039 |            0.001 |            0.761 | 
##                |            0.905 |            0.608 |            0.704 |            1.000 |            0.782 |            0.447 |            0.721 |            0.732 |            0.905 |                  | 
##                |            0.052 |            0.018 |            0.045 |            0.000 |            0.543 |            0.015 |            0.057 |            0.030 |            0.000 |                  | 
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##              1 |              265 |              561 |              927 |                0 |             7387 |              938 |             1077 |              530 |                2 |            11687 | 
##                |          669.750 |          342.651 |          750.388 |            2.393 |         8113.088 |          405.583 |          924.106 |          474.017 |            5.025 |                  | 
##                |          244.602 |          139.139 |           41.568 |            2.393 |           64.982 |          698.916 |           25.296 |            6.612 |            1.821 |                  | 
##                |            0.023 |            0.048 |            0.079 |            0.000 |            0.632 |            0.080 |            0.092 |            0.045 |            0.000 |            0.239 | 
##                |            0.095 |            0.392 |            0.296 |            0.000 |            0.218 |            0.553 |            0.279 |            0.268 |            0.095 |                  | 
##                |            0.005 |            0.011 |            0.019 |            0.000 |            0.151 |            0.019 |            0.022 |            0.011 |            0.000 |                  | 
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
##   Column Total |             2799 |             1432 |             3136 |               10 |            33906 |             1695 |             3862 |             1981 |               21 |            48842 | 
##                |            0.057 |            0.029 |            0.064 |            0.000 |            0.694 |            0.035 |            0.079 |            0.041 |            0.000 |                  | 
## ---------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  1610.752     d.f. =  8     p =  0 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$education_level 
## ovr50$over_50k |         10th |         11th |         12th |      1st-4th |      5th-6th |      7th-8th |          9th |   Assoc-acdm |    Assoc-voc |    Bachelors |    Doctorate |      HS-grad |      Masters |    Preschool |  Prof-school | Some-college |    Row Total | 
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
##              0 |         1302 |         1720 |          609 |          239 |          482 |          893 |          715 |         1188 |         1539 |         4712 |          163 |        13281 |         1198 |           82 |          217 |         8815 |        37155 | 
##                |     1056.638 |     1378.421 |      499.792 |      187.897 |      387.206 |      726.486 |      575.103 |     1217.910 |     1567.840 |     6104.764 |      451.867 |    12007.177 |     2021.228 |       63.140 |      634.439 |     8275.093 |              | 
##                |       56.976 |       84.645 |       23.863 |       13.898 |       23.207 |       38.166 |       34.031 |        0.735 |        0.531 |      317.750 |      184.665 |      135.138 |      335.294 |        5.634 |      274.660 |       35.226 |              | 
##                |        0.035 |        0.046 |        0.016 |        0.006 |        0.013 |        0.024 |        0.019 |        0.032 |        0.041 |        0.127 |        0.004 |        0.357 |        0.032 |        0.002 |        0.006 |        0.237 |        0.761 | 
##                |        0.937 |        0.949 |        0.927 |        0.968 |        0.947 |        0.935 |        0.946 |        0.742 |        0.747 |        0.587 |        0.274 |        0.841 |        0.451 |        0.988 |        0.260 |        0.810 |              | 
##                |        0.027 |        0.035 |        0.012 |        0.005 |        0.010 |        0.018 |        0.015 |        0.024 |        0.032 |        0.096 |        0.003 |        0.272 |        0.025 |        0.002 |        0.004 |        0.180 |              | 
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
##              1 |           87 |           92 |           48 |            8 |           27 |           62 |           41 |          413 |          522 |         3313 |          431 |         2503 |         1459 |            1 |          617 |         2063 |        11687 | 
##                |      332.362 |      433.579 |      157.208 |       59.103 |      121.794 |      228.514 |      180.897 |      383.090 |      493.160 |     1920.236 |      142.133 |     3776.823 |      635.772 |       19.860 |      199.561 |     2602.907 |              | 
##                |      181.136 |      269.100 |       75.864 |       44.185 |       73.780 |      121.336 |      108.190 |        2.335 |        1.687 |     1010.184 |      587.082 |      429.627 |     1065.956 |       17.911 |      873.193 |      111.990 |              | 
##                |        0.007 |        0.008 |        0.004 |        0.001 |        0.002 |        0.005 |        0.004 |        0.035 |        0.045 |        0.283 |        0.037 |        0.214 |        0.125 |        0.000 |        0.053 |        0.177 |        0.239 | 
##                |        0.063 |        0.051 |        0.073 |        0.032 |        0.053 |        0.065 |        0.054 |        0.258 |        0.253 |        0.413 |        0.726 |        0.159 |        0.549 |        0.012 |        0.740 |        0.190 |              | 
##                |        0.002 |        0.002 |        0.001 |        0.000 |        0.001 |        0.001 |        0.001 |        0.008 |        0.011 |        0.068 |        0.009 |        0.051 |        0.030 |        0.000 |        0.013 |        0.042 |              | 
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
##   Column Total |         1389 |         1812 |          657 |          247 |          509 |          955 |          756 |         1601 |         2061 |         8025 |          594 |        15784 |         2657 |           83 |          834 |        10878 |        48842 | 
##                |        0.028 |        0.037 |        0.013 |        0.005 |        0.010 |        0.020 |        0.015 |        0.033 |        0.042 |        0.164 |        0.012 |        0.323 |        0.054 |        0.002 |        0.017 |        0.223 |              | 
## ---------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|--------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  6537.973     d.f. =  15     p =  0 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$marital_status 
## ovr50$over_50k |              Divorced |     Married-AF-spouse |    Married-civ-spouse | Married-spouse-absent |         Never-married |             Separated |               Widowed |             Row Total | 
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
##              0 |                  5962 |                    23 |                 12395 |                   570 |                 15384 |                  1431 |                  1390 |                 37155 | 
##                |              5045.844 |                28.147 |             17024.113 |               477.731 |             12260.496 |              1163.899 |              1154.770 |                       | 
##                |               166.343 |                 0.941 |              1258.726 |                17.821 |               795.749 |                61.297 |                47.917 |                       | 
##                |                 0.160 |                 0.001 |                 0.334 |                 0.015 |                 0.414 |                 0.039 |                 0.037 |                 0.761 | 
##                |                 0.899 |                 0.622 |                 0.554 |                 0.908 |                 0.955 |                 0.935 |                 0.916 |                       | 
##                |                 0.122 |                 0.000 |                 0.254 |                 0.012 |                 0.315 |                 0.029 |                 0.028 |                       | 
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
##              1 |                   671 |                    14 |                  9984 |                    58 |                   733 |                    99 |                   128 |                 11687 | 
##                |              1587.156 |                 8.853 |              5354.887 |               150.269 |              3856.504 |               366.101 |               363.230 |                       | 
##                |               528.834 |                 2.992 |              4001.708 |                56.655 |              2529.824 |               194.872 |               152.336 |                       | 
##                |                 0.057 |                 0.001 |                 0.854 |                 0.005 |                 0.063 |                 0.008 |                 0.011 |                 0.239 | 
##                |                 0.101 |                 0.378 |                 0.446 |                 0.092 |                 0.045 |                 0.065 |                 0.084 |                       | 
##                |                 0.014 |                 0.000 |                 0.204 |                 0.001 |                 0.015 |                 0.002 |                 0.003 |                       | 
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
##   Column Total |                  6633 |                    37 |                 22379 |                   628 |                 16117 |                  1530 |                  1518 |                 48842 | 
##                |                 0.136 |                 0.001 |                 0.458 |                 0.013 |                 0.330 |                 0.031 |                 0.031 |                       | 
## ---------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  9816.015     d.f. =  6     p =  0 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$occupation 
## ovr50$over_50k |                 ? |      Adm-clerical |      Armed-Forces |      Craft-repair |   Exec-managerial |   Farming-fishing | Handlers-cleaners | Machine-op-inspct |     Other-service |   Priv-house-serv |    Prof-specialty |   Protective-serv |             Sales |      Tech-support |  Transport-moving |         Row Total | 
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
##              0 |              2544 |              4843 |                10 |              4729 |              3178 |              1317 |              1934 |              2650 |              4719 |               239 |              3388 |               675 |              4029 |              1026 |              1874 |             37155 | 
##                |          2136.858 |          4268.390 |            11.411 |          4649.510 |          4629.731 |          1133.470 |          1576.208 |          2298.891 |          3745.016 |           184.094 |          4695.153 |           747.786 |          4186.993 |          1099.999 |          1791.491 |                   | 
##                |            77.574 |            77.354 |             0.174 |             1.359 |           455.215 |            29.717 |            81.217 |            53.625 |           253.309 |            16.376 |           363.918 |             7.085 |             5.962 |             4.978 |             3.800 |                   | 
##                |             0.068 |             0.130 |             0.000 |             0.127 |             0.086 |             0.035 |             0.052 |             0.071 |             0.127 |             0.006 |             0.091 |             0.018 |             0.108 |             0.028 |             0.050 |             0.761 | 
##                |             0.906 |             0.863 |             0.667 |             0.774 |             0.522 |             0.884 |             0.933 |             0.877 |             0.959 |             0.988 |             0.549 |             0.687 |             0.732 |             0.710 |             0.796 |                   | 
##                |             0.052 |             0.099 |             0.000 |             0.097 |             0.065 |             0.027 |             0.040 |             0.054 |             0.097 |             0.005 |             0.069 |             0.014 |             0.082 |             0.021 |             0.038 |                   | 
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
##              1 |               265 |               768 |                 5 |              1383 |              2908 |               173 |               138 |               372 |               204 |                 3 |              2784 |               308 |              1475 |               420 |               481 |             11687 | 
##                |           672.142 |          1342.610 |             3.589 |          1462.490 |          1456.269 |           356.530 |           495.792 |           723.109 |          1177.984 |            57.906 |          1476.847 |           235.214 |          1317.007 |           346.001 |           563.509 |                   | 
##                |           246.622 |           245.921 |             0.555 |             4.320 |          1447.208 |            94.475 |           258.203 |           170.483 |           805.312 |            52.062 |          1156.957 |            22.523 |            18.953 |            15.826 |            12.081 |                   | 
##                |             0.023 |             0.066 |             0.000 |             0.118 |             0.249 |             0.015 |             0.012 |             0.032 |             0.017 |             0.000 |             0.238 |             0.026 |             0.126 |             0.036 |             0.041 |             0.239 | 
##                |             0.094 |             0.137 |             0.333 |             0.226 |             0.478 |             0.116 |             0.067 |             0.123 |             0.041 |             0.012 |             0.451 |             0.313 |             0.268 |             0.290 |             0.204 |                   | 
##                |             0.005 |             0.016 |             0.000 |             0.028 |             0.060 |             0.004 |             0.003 |             0.008 |             0.004 |             0.000 |             0.057 |             0.006 |             0.030 |             0.009 |             0.010 |                   | 
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
##   Column Total |              2809 |              5611 |                15 |              6112 |              6086 |              1490 |              2072 |              3022 |              4923 |               242 |              6172 |               983 |              5504 |              1446 |              2355 |             48842 | 
##                |             0.058 |             0.115 |             0.000 |             0.125 |             0.125 |             0.031 |             0.042 |             0.062 |             0.101 |             0.005 |             0.126 |             0.020 |             0.113 |             0.030 |             0.048 |                   | 
## ---------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  5983.164     d.f. =  14     p =  0 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$relationship 
## ovr50$over_50k |        Husband |  Not-in-family | Other-relative |      Own-child |      Unmarried |           Wife |      Row Total | 
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
##              0 |          10870 |          11307 |           1454 |           7470 |           4816 |           1238 |          37155 | 
##                |      14998.321 |       9572.118 |       1145.642 |       5767.005 |       3898.681 |       1773.234 |                | 
##                |       1136.329 |        314.436 |         82.997 |        502.894 |        215.836 |        161.555 |                | 
##                |          0.293 |          0.304 |          0.039 |          0.201 |          0.130 |          0.033 |          0.761 | 
##                |          0.551 |          0.899 |          0.965 |          0.985 |          0.940 |          0.531 |                | 
##                |          0.223 |          0.232 |          0.030 |          0.153 |          0.099 |          0.025 |                | 
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
##              1 |           8846 |           1276 |             52 |            111 |            309 |           1093 |          11687 | 
##                |       4717.679 |       3010.882 |        360.358 |       1813.995 |       1226.319 |        557.766 |                | 
##                |       3612.588 |        999.646 |        263.862 |       1598.787 |        686.179 |        513.613 |                | 
##                |          0.757 |          0.109 |          0.004 |          0.009 |          0.026 |          0.094 |          0.239 | 
##                |          0.449 |          0.101 |          0.035 |          0.015 |          0.060 |          0.469 |                | 
##                |          0.181 |          0.026 |          0.001 |          0.002 |          0.006 |          0.022 |                | 
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
##   Column Total |          19716 |          12583 |           1506 |           7581 |           5125 |           2331 |          48842 | 
##                |          0.404 |          0.258 |          0.031 |          0.155 |          0.105 |          0.048 |                | 
## ---------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  10088.72     d.f. =  5     p =  0 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$race 
## ovr50$over_50k | Amer-Indian-Eskimo | Asian-Pac-Islander |              Black |              Other |              White |          Row Total | 
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##              0 |                415 |               1110 |               4119 |                356 |              31155 |              37155 | 
##                |            357.538 |           1155.531 |           3563.965 |            308.852 |          31769.115 |                    | 
##                |              9.235 |              1.794 |             86.439 |              7.198 |             11.871 |                    | 
##                |              0.011 |              0.030 |              0.111 |              0.010 |              0.839 |              0.761 | 
##                |              0.883 |              0.731 |              0.879 |              0.877 |              0.746 |                    | 
##                |              0.008 |              0.023 |              0.084 |              0.007 |              0.638 |                    | 
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##              1 |                 55 |                409 |                566 |                 50 |              10607 |              11687 | 
##                |            112.462 |            363.469 |           1121.035 |             97.148 |           9992.885 |                    | 
##                |             29.360 |              5.704 |            274.803 |             22.882 |             37.741 |                    | 
##                |              0.005 |              0.035 |              0.048 |              0.004 |              0.908 |              0.239 | 
##                |              0.117 |              0.269 |              0.121 |              0.123 |              0.254 |                    | 
##                |              0.001 |              0.008 |              0.012 |              0.001 |              0.217 |                    | 
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
##   Column Total |                470 |               1519 |               4685 |                406 |              41762 |              48842 | 
##                |              0.010 |              0.031 |              0.096 |              0.008 |              0.855 |                    | 
## ---------------|--------------------|--------------------|--------------------|--------------------|--------------------|--------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  487.0263     d.f. =  4     p =  4.284378e-104 
## 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$sex 
## ovr50$over_50k |    Female |      Male | Row Total | 
## ---------------|-----------|-----------|-----------|
##              0 |     14423 |     22732 |     37155 | 
##                | 12317.550 | 24837.450 |           | 
##                |   359.887 |   178.477 |           | 
##                |     0.388 |     0.612 |     0.761 | 
##                |     0.891 |     0.696 |           | 
##                |     0.295 |     0.465 |           | 
## ---------------|-----------|-----------|-----------|
##              1 |      1769 |      9918 |     11687 | 
##                |  3874.450 |  7812.550 |           | 
##                |  1144.142 |   567.410 |           | 
##                |     0.151 |     0.849 |     0.239 | 
##                |     0.109 |     0.304 |           | 
##                |     0.036 |     0.203 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |     16192 |     32650 |     48842 | 
##                |     0.332 |     0.668 |           | 
## ---------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  2249.916     d.f. =  1     p =  0 
## 
## Pearson's Chi-squared test with Yates' continuity correction 
## ------------------------------------------------------------
## Chi^2 =  2248.848     d.f. =  1     p =  0 
## 
## 
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |              Expected N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  48842 
## 
##  
##                | ovr50$country 
## ovr50$over_50k |                          ? |                   Cambodia |                     Canada |                      China |                   Columbia |                       Cuba |         Dominican-Republic |                    Ecuador |                El-Salvador |                    England |                     France |                    Germany |                     Greece |                  Guatemala |                      Haiti |         Holand-Netherlands |                   Honduras |                       Hong |                    Hungary |                      India |                       Iran |                    Ireland |                      Italy |                    Jamaica |                      Japan |                       Laos |                     Mexico |                  Nicaragua | Outlying-US(Guam-USVI-etc) |                       Peru |                Philippines |                     Poland |                   Portugal |                Puerto-Rico |                   Scotland |                      South |                     Taiwan |                   Thailand |            Trinadad&Tobago |              United-States |                    Vietnam |                 Yugoslavia |                  Row Total | 
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
##              0 |                        637 |                         19 |                        119 |                         86 |                         81 |                        104 |                         98 |                         39 |                        144 |                         80 |                         22 |                        148 |                         31 |                         85 |                         66 |                          1 |                         18 |                         22 |                         13 |                         89 |                         37 |                         26 |                         71 |                         91 |                         60 |                         21 |                        904 |                         46 |                         22 |                         42 |                        210 |                         70 |                         55 |                        164 |                         18 |                         95 |                         39 |                         25 |                         25 |                      33138 |                         79 |                         15 |                      37155 | 
##                |                    651.936 |                     21.300 |                    138.451 |                     92.808 |                     64.661 |                    104.979 |                     78.354 |                     34.232 |                    117.911 |                     96.611 |                     28.907 |                    156.708 |                     37.275 |                     66.943 |                     57.054 |                      0.761 |                     15.214 |                     22.822 |                     14.454 |                    114.868 |                     44.882 |                     28.147 |                     79.875 |                     80.636 |                     69.986 |                     17.497 |                    723.443 |                     37.275 |                     17.497 |                     34.993 |                    224.412 |                     66.182 |                     50.968 |                    139.972 |                     15.975 |                     87.483 |                     49.447 |                     22.822 |                     20.539 |                  33343.802 |                     65.422 |                     17.497 |                            | 
##                |                      0.342 |                      0.248 |                      2.733 |                      0.499 |                      4.129 |                      0.009 |                      4.926 |                      0.664 |                      5.772 |                      2.856 |                      1.650 |                      0.484 |                      1.056 |                      4.871 |                      1.403 |                      0.075 |                      0.510 |                      0.030 |                      0.146 |                      5.826 |                      1.384 |                      0.164 |                      0.986 |                      1.332 |                      1.425 |                      0.702 |                     45.063 |                      2.042 |                      1.159 |                      1.403 |                      0.926 |                      0.220 |                      0.319 |                      4.125 |                      0.257 |                      0.646 |                      2.207 |                      0.208 |                      0.969 |                      1.270 |                      2.818 |                      0.356 |                            | 
##                |                      0.017 |                      0.001 |                      0.003 |                      0.002 |                      0.002 |                      0.003 |                      0.003 |                      0.001 |                      0.004 |                      0.002 |                      0.001 |                      0.004 |                      0.001 |                      0.002 |                      0.002 |                      0.000 |                      0.000 |                      0.001 |                      0.000 |                      0.002 |                      0.001 |                      0.001 |                      0.002 |                      0.002 |                      0.002 |                      0.001 |                      0.024 |                      0.001 |                      0.001 |                      0.001 |                      0.006 |                      0.002 |                      0.001 |                      0.004 |                      0.000 |                      0.003 |                      0.001 |                      0.001 |                      0.001 |                      0.892 |                      0.002 |                      0.000 |                      0.761 | 
##                |                      0.743 |                      0.679 |                      0.654 |                      0.705 |                      0.953 |                      0.754 |                      0.951 |                      0.867 |                      0.929 |                      0.630 |                      0.579 |                      0.718 |                      0.633 |                      0.966 |                      0.880 |                      1.000 |                      0.900 |                      0.733 |                      0.684 |                      0.589 |                      0.627 |                      0.703 |                      0.676 |                      0.858 |                      0.652 |                      0.913 |                      0.951 |                      0.939 |                      0.957 |                      0.913 |                      0.712 |                      0.805 |                      0.821 |                      0.891 |                      0.857 |                      0.826 |                      0.600 |                      0.833 |                      0.926 |                      0.756 |                      0.919 |                      0.652 |                            | 
##                |                      0.013 |                      0.000 |                      0.002 |                      0.002 |                      0.002 |                      0.002 |                      0.002 |                      0.001 |                      0.003 |                      0.002 |                      0.000 |                      0.003 |                      0.001 |                      0.002 |                      0.001 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.002 |                      0.001 |                      0.001 |                      0.001 |                      0.002 |                      0.001 |                      0.000 |                      0.019 |                      0.001 |                      0.000 |                      0.001 |                      0.004 |                      0.001 |                      0.001 |                      0.003 |                      0.000 |                      0.002 |                      0.001 |                      0.001 |                      0.001 |                      0.678 |                      0.002 |                      0.000 |                            | 
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
##              1 |                        220 |                          9 |                         63 |                         36 |                          4 |                         34 |                          5 |                          6 |                         11 |                         47 |                         16 |                         58 |                         18 |                          3 |                          9 |                          0 |                          2 |                          8 |                          6 |                         62 |                         22 |                         11 |                         34 |                         15 |                         32 |                          2 |                         47 |                          3 |                          1 |                          4 |                         85 |                         17 |                         12 |                         20 |                          3 |                         20 |                         26 |                          5 |                          2 |                      10694 |                          7 |                          8 |                      11687 | 
##                |                    205.064 |                      6.700 |                     43.549 |                     29.192 |                     20.339 |                     33.021 |                     24.646 |                     10.768 |                     37.089 |                     30.389 |                      9.093 |                     49.292 |                     11.725 |                     21.057 |                     17.946 |                      0.239 |                      4.786 |                      7.178 |                      4.546 |                     36.132 |                     14.118 |                      8.853 |                     25.125 |                     25.364 |                     22.014 |                      5.503 |                    227.557 |                     11.725 |                      5.503 |                     11.007 |                     70.588 |                     20.818 |                     16.032 |                     44.028 |                      5.025 |                     27.517 |                     15.553 |                      7.178 |                      6.461 |                  10488.198 |                     20.578 |                      5.503 |                            | 
##                |                      1.088 |                      0.790 |                      8.687 |                      1.588 |                     13.126 |                      0.029 |                     15.660 |                      2.111 |                     18.351 |                      9.080 |                      5.247 |                      1.538 |                      3.359 |                     15.484 |                      4.460 |                      0.239 |                      1.621 |                      0.094 |                      0.465 |                     18.521 |                      4.401 |                      0.520 |                      3.135 |                      4.235 |                      4.530 |                      2.230 |                    143.264 |                      6.492 |                      3.685 |                      4.461 |                      2.942 |                      0.700 |                      1.014 |                     13.113 |                      0.816 |                      2.054 |                      7.017 |                      0.661 |                      3.080 |                      4.038 |                      8.959 |                      1.132 |                            | 
##                |                      0.019 |                      0.001 |                      0.005 |                      0.003 |                      0.000 |                      0.003 |                      0.000 |                      0.001 |                      0.001 |                      0.004 |                      0.001 |                      0.005 |                      0.002 |                      0.000 |                      0.001 |                      0.000 |                      0.000 |                      0.001 |                      0.001 |                      0.005 |                      0.002 |                      0.001 |                      0.003 |                      0.001 |                      0.003 |                      0.000 |                      0.004 |                      0.000 |                      0.000 |                      0.000 |                      0.007 |                      0.001 |                      0.001 |                      0.002 |                      0.000 |                      0.002 |                      0.002 |                      0.000 |                      0.000 |                      0.915 |                      0.001 |                      0.001 |                      0.239 | 
##                |                      0.257 |                      0.321 |                      0.346 |                      0.295 |                      0.047 |                      0.246 |                      0.049 |                      0.133 |                      0.071 |                      0.370 |                      0.421 |                      0.282 |                      0.367 |                      0.034 |                      0.120 |                      0.000 |                      0.100 |                      0.267 |                      0.316 |                      0.411 |                      0.373 |                      0.297 |                      0.324 |                      0.142 |                      0.348 |                      0.087 |                      0.049 |                      0.061 |                      0.043 |                      0.087 |                      0.288 |                      0.195 |                      0.179 |                      0.109 |                      0.143 |                      0.174 |                      0.400 |                      0.167 |                      0.074 |                      0.244 |                      0.081 |                      0.348 |                            | 
##                |                      0.005 |                      0.000 |                      0.001 |                      0.001 |                      0.000 |                      0.001 |                      0.000 |                      0.000 |                      0.000 |                      0.001 |                      0.000 |                      0.001 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.001 |                      0.000 |                      0.000 |                      0.001 |                      0.000 |                      0.001 |                      0.000 |                      0.001 |                      0.000 |                      0.000 |                      0.000 |                      0.002 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.000 |                      0.001 |                      0.000 |                      0.000 |                      0.219 |                      0.000 |                      0.000 |                            | 
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
##   Column Total |                        857 |                         28 |                        182 |                        122 |                         85 |                        138 |                        103 |                         45 |                        155 |                        127 |                         38 |                        206 |                         49 |                         88 |                         75 |                          1 |                         20 |                         30 |                         19 |                        151 |                         59 |                         37 |                        105 |                        106 |                         92 |                         23 |                        951 |                         49 |                         23 |                         46 |                        295 |                         87 |                         67 |                        184 |                         21 |                        115 |                         65 |                         30 |                         27 |                      43832 |                         86 |                         23 |                      48842 | 
##                |                      0.018 |                      0.001 |                      0.004 |                      0.002 |                      0.002 |                      0.003 |                      0.002 |                      0.001 |                      0.003 |                      0.003 |                      0.001 |                      0.004 |                      0.001 |                      0.002 |                      0.002 |                      0.000 |                      0.000 |                      0.001 |                      0.000 |                      0.003 |                      0.001 |                      0.001 |                      0.002 |                      0.002 |                      0.002 |                      0.000 |                      0.019 |                      0.001 |                      0.000 |                      0.001 |                      0.006 |                      0.002 |                      0.001 |                      0.004 |                      0.000 |                      0.002 |                      0.001 |                      0.001 |                      0.001 |                      0.897 |                      0.002 |                      0.000 |                            | 
## ---------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|----------------------------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  452.229     d.f. =  41     p =  1.035618e-70 
## 
## 
## 
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  table(ovr50$occupation, ovr50$over_50k)
## p-value = 0.0004998
## alternative hypothesis: two.sided
## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  2000 replicates)
## 
## data:  table(ovr50$country, ovr50$over_50k)
## p-value = 0.0004998
## alternative hypothesis: two.sided
## 
## Call:
## glm(formula = over_50k ~ education_num, family = binomial(link = "logit"), 
##     data = ovr50)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5258  -0.6690  -0.5674  -0.1984   3.0501  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.004159   0.057814  -86.56   <2e-16 ***
## education_num  0.362116   0.005128   70.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53751  on 48841  degrees of freedom
## Residual deviance: 47775  on 48840  degrees of freedom
## AIC: 47779
## 
## Number of Fisher Scoring iterations: 4
## 
## Call:
## glm(formula = over_50k ~ capital_gain, family = binomial(link = "logit"), 
##     data = ovr50)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.0178  -0.6684  -0.6684  -0.6684   1.7936  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.385e+00  1.168e-02 -118.56   <2e-16 ***
## capital_gain  3.383e-04  7.114e-06   47.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53751  on 48841  degrees of freedom
## Residual deviance: 48700  on 48840  degrees of freedom
## AIC: 48704
## 
## Number of Fisher Scoring iterations: 6
## 
## Call:
## glm(formula = over_50k ~ capital_loss, family = binomial(link = "logit"), 
##     data = ovr50)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9769  -0.7151  -0.7151  -0.7151   1.7256  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.233e+00  1.107e-02 -111.34   <2e-16 ***
## capital_loss  6.966e-04  2.294e-05   30.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53751  on 48841  degrees of freedom
## Residual deviance: 52849  on 48840  degrees of freedom
## AIC: 52853
## 
## Number of Fisher Scoring iterations: 4
## 
## Call:
## glm(formula = over_50k ~ hours_week, family = binomial(link = "logit"), 
##     data = ovr50)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8251  -0.7103  -0.7103  -0.3580   2.4832  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.0821209  0.0429865  -71.70   <2e-16 ***
## hours_week   0.0458377  0.0009623   47.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53751  on 48841  degrees of freedom
## Residual deviance: 51160  on 48840  degrees of freedom
## AIC: 51164
## 
## Number of Fisher Scoring iterations: 4

Exploring the data part 3

#Visualizing the variables

##To visualize each variable.
  
  #First Categorical Visualizations (bar plots)
  #workclass, education_level, marital_status, occupation, relationship, race, sex, country
  
  # workclass: number of rows for every (over_50k, workclass) combination
  df_hold <- summarize(group_by(ovr50, over_50k, workclass), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 17 × 3
## # Groups:   over_50k [2]
##    over_50k workclass        count
##       <int> <chr>            <int>
##  1        0 ?                 2534
##  2        0 Federal-gov        871
##  3        0 Local-gov         2209
##  4        0 Never-worked        10
##  5        0 Private          26519
##  6        0 Self-emp-inc       757
##  7        0 Self-emp-not-inc  2785
##  8        0 State-gov         1451
##  9        0 Without-pay         19
## 10        1 ?                  265
## 11        1 Federal-gov        561
## 12        1 Local-gov          927
## 13        1 Private           7387
## 14        1 Self-emp-inc       938
## 15        1 Self-emp-not-inc  1077
## 16        1 State-gov          530
## 17        1 Without-pay          2
  # Stacked row counts per income class, filled by workclass
  ggplot(ovr50, aes(x = over_50k, fill = factor(workclass))) +
    geom_bar()

  # Stacked row counts per workclass, filled by income class
  ggplot(ovr50, aes(x = workclass, fill = factor(over_50k))) +
    geom_bar()

  # education_level: number of rows for every (over_50k, education_level) combination
  df_hold <- summarize(group_by(ovr50, over_50k, education_level), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 32 × 3
## # Groups:   over_50k [2]
##    over_50k education_level count
##       <int> <chr>           <int>
##  1        0 10th             1302
##  2        0 11th             1720
##  3        0 12th              609
##  4        0 1st-4th           239
##  5        0 5th-6th           482
##  6        0 7th-8th           893
##  7        0 9th               715
##  8        0 Assoc-acdm       1188
##  9        0 Assoc-voc        1539
## 10        0 Bachelors        4712
## # … with 22 more rows
  # Stacked row counts per income class, filled by education level
  ggplot(ovr50, aes(x = over_50k, fill = factor(education_level))) +
    geom_bar()

  # Stacked row counts per education level, filled by income class
  ggplot(ovr50, aes(x = education_level, fill = factor(over_50k))) +
    geom_bar()

  # marital_status: number of rows for every (over_50k, marital_status) combination
  df_hold <- summarize(group_by(ovr50, over_50k, marital_status), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 14 × 3
## # Groups:   over_50k [2]
##    over_50k marital_status        count
##       <int> <chr>                 <int>
##  1        0 Divorced               5962
##  2        0 Married-AF-spouse        23
##  3        0 Married-civ-spouse    12395
##  4        0 Married-spouse-absent   570
##  5        0 Never-married         15384
##  6        0 Separated              1431
##  7        0 Widowed                1390
##  8        1 Divorced                671
##  9        1 Married-AF-spouse        14
## 10        1 Married-civ-spouse     9984
## 11        1 Married-spouse-absent    58
## 12        1 Never-married           733
## 13        1 Separated                99
## 14        1 Widowed                 128
  # Stacked row counts per income class, filled by marital status
  ggplot(ovr50, aes(x = over_50k, fill = factor(marital_status))) +
    geom_bar()

  # Stacked row counts per marital status, filled by income class
  ggplot(ovr50, aes(x = marital_status, fill = factor(over_50k))) +
    geom_bar()

  # occupation: number of rows for every (over_50k, occupation) combination
  df_hold <- summarize(group_by(ovr50, over_50k, occupation), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 30 × 3
## # Groups:   over_50k [2]
##    over_50k occupation        count
##       <int> <chr>             <int>
##  1        0 ?                  2544
##  2        0 Adm-clerical       4843
##  3        0 Armed-Forces         10
##  4        0 Craft-repair       4729
##  5        0 Exec-managerial    3178
##  6        0 Farming-fishing    1317
##  7        0 Handlers-cleaners  1934
##  8        0 Machine-op-inspct  2650
##  9        0 Other-service      4719
## 10        0 Priv-house-serv     239
## # … with 20 more rows
  # Stacked row counts per income class, filled by occupation
  ggplot(ovr50, aes(x = over_50k, fill = factor(occupation))) +
    geom_bar()

  # Stacked row counts per occupation, filled by income class
  ggplot(ovr50, aes(x = occupation, fill = factor(over_50k))) +
    geom_bar()

  # relationship: number of rows for every (over_50k, relationship) combination
  df_hold <- summarize(group_by(ovr50, over_50k, relationship), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 12 × 3
## # Groups:   over_50k [2]
##    over_50k relationship   count
##       <int> <chr>          <int>
##  1        0 Husband        10870
##  2        0 Not-in-family  11307
##  3        0 Other-relative  1454
##  4        0 Own-child       7470
##  5        0 Unmarried       4816
##  6        0 Wife            1238
##  7        1 Husband         8846
##  8        1 Not-in-family   1276
##  9        1 Other-relative    52
## 10        1 Own-child        111
## 11        1 Unmarried        309
## 12        1 Wife            1093
  # Stacked row counts per income class, filled by relationship
  ggplot(ovr50, aes(x = over_50k, fill = factor(relationship))) +
    geom_bar()

  # Stacked row counts per relationship, filled by income class
  ggplot(ovr50, aes(x = relationship, fill = factor(over_50k))) +
    geom_bar()

  # race: number of rows for every (over_50k, race) combination
  df_hold <- summarize(group_by(ovr50, over_50k, race), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 10 × 3
## # Groups:   over_50k [2]
##    over_50k race               count
##       <int> <chr>              <int>
##  1        0 Amer-Indian-Eskimo   415
##  2        0 Asian-Pac-Islander  1110
##  3        0 Black               4119
##  4        0 Other                356
##  5        0 White              31155
##  6        1 Amer-Indian-Eskimo    55
##  7        1 Asian-Pac-Islander   409
##  8        1 Black                566
##  9        1 Other                 50
## 10        1 White              10607
  # Stacked row counts per income class, filled by race
  ggplot(ovr50, aes(x = over_50k, fill = factor(race))) +
    geom_bar()

  # Stacked row counts per race, filled by income class
  ggplot(ovr50, aes(x = race, fill = factor(over_50k))) +
    geom_bar()

  # sex: number of rows for every (over_50k, sex) combination
  df_hold <- summarize(group_by(ovr50, over_50k, sex), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 4 × 3
## # Groups:   over_50k [2]
##   over_50k sex    count
##      <int> <chr>  <int>
## 1        0 Female 14423
## 2        0 Male   22732
## 3        1 Female  1769
## 4        1 Male    9918
  # Stacked row counts per income class, filled by sex
  ggplot(ovr50, aes(x = over_50k, fill = factor(sex))) +
    geom_bar()

  # Stacked row counts per sex, filled by income class
  ggplot(ovr50, aes(x = sex, fill = factor(over_50k))) +
    geom_bar()

  # country: number of rows for every (over_50k, country) combination
  df_hold <- summarize(group_by(ovr50, over_50k, country), count = n())
## `summarise()` has grouped output by 'over_50k'. You can override using the `.groups` argument.
  df_hold
## # A tibble: 83 × 3
## # Groups:   over_50k [2]
##    over_50k country            count
##       <int> <chr>              <int>
##  1        0 ?                    637
##  2        0 Cambodia              19
##  3        0 Canada               119
##  4        0 China                 86
##  5        0 Columbia              81
##  6        0 Cuba                 104
##  7        0 Dominican-Republic    98
##  8        0 Ecuador               39
##  9        0 El-Salvador          144
## 10        0 England               80
## # … with 73 more rows
  # Stacked row counts per income class, filled by country
  ggplot(ovr50, aes(x = over_50k, fill = factor(country))) +
    geom_bar()

  # Stacked row counts per country, filled by income class
  ggplot(ovr50, aes(x = country, fill = factor(over_50k))) +
    geom_bar()

  #Next Continuous Visualizations (boxplots)
  #education_num, capital_gain, capital_loss, hours_week
  
  # education_num: overall distribution, distribution by income class, and a
  # rank-based test of whether education_num differs between income classes.

  # Histogram on the density scale (y = ..density..), so the axis is labelled
  # "Density" rather than "Count".
  ggplot(ovr50, aes(x = education_num)) +
    geom_histogram(aes(y = ..density..), alpha = 0.5) +
    labs(x = "Education Level Achieved", y = "Density", title = "Education Level")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  # One box per income class; coord_flip() lays the boxes out horizontally.
  ggplot(data = ovr50, aes(y = education_num, x = over_50k, group = over_50k)) +
    geom_boxplot() +
    labs(y = "Education Number", x = "Over 50k Income (1=yes)") +
    coord_flip()

  # over_50k is an integer column; it must be wrapped in factor() so that the
  # fill is discrete and each bar is actually split (stacked) by income class.
  ggplot(ovr50, aes(x = education_num, fill = factor(over_50k))) +
    geom_bar(position = "stack")

  # Kruskal-Wallis rank sum test of education_num across the over_50k groups.
  kruskal.test(education_num ~ over_50k, data = ovr50)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  education_num by over_50k
## Kruskal-Wallis chi-squared = 5248.6, df = 1, p-value < 2.2e-16
  # Kruskal-Wallis chi-squared = 5248.6, df = 1, p-value < 2.2e-16
  
  # capital gain: overall distribution, distribution by income class, and a
  # rank-based test of whether capital_gain differs between income classes.

  # Histogram on the density scale (y = ..density..), so the axis is labelled
  # "Density" rather than "Count".
  ggplot(ovr50, aes(x = capital_gain)) +
    geom_histogram(aes(y = ..density..), alpha = 0.5, binwidth = 100) +
    labs(x = "Capital Gain", y = "Density", title = "Capital Gains")

  # One box per income class; coord_flip() lays the boxes out horizontally.
  ggplot(data = ovr50, aes(y = capital_gain, x = over_50k, group = over_50k)) +
    geom_boxplot() +
    labs(y = "Capital Gains", x = "Over 50k Income (1=yes)") +
    coord_flip()

  # Significantly skewed!
  # NOTE(review): the original note read "left skewed". If the mass sits near 0
  # with a long tail toward large gains, that is right (positive) skew --
  # confirm against the plots above.

  # Kruskal-Wallis rank sum test of capital_gain across the over_50k groups.
  kruskal.test(capital_gain ~ over_50k, data = ovr50)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  capital_gain by over_50k
## Kruskal-Wallis chi-squared = 3767.4, df = 1, p-value < 2.2e-16
  # Kruskal-Wallis chi-squared = 3767.4, df = 1, p-value < 2.2e-16
  
  # capital loss: overall distribution, distribution by income class, and a
  # rank-based test of whether capital_loss differs between income classes.

  # Histogram on the density scale (y = ..density..), so the axis is labelled
  # "Density" rather than "Count".
  ggplot(ovr50, aes(x = capital_loss)) +
    geom_histogram(aes(y = ..density..), alpha = 0.5, binwidth = 10) +
    labs(x = "Capital Loss", y = "Density", title = "Capital Loss")

  # One box per income class. The y aesthetic is capital_loss: the earlier
  # version plotted capital_gain under a "Capital Loss" label (copy-paste slip).
  ggplot(data = ovr50, aes(y = capital_loss, x = over_50k, group = over_50k)) +
    geom_boxplot() +
    labs(y = "Capital Loss", x = "Over 50k Income (1=yes)") +
    coord_flip()

  # Skewed!
  # NOTE(review): the original note read "left skewed". If the mass sits near 0
  # with a long tail toward large losses, that is right (positive) skew --
  # confirm against the plots above.

  # Kruskal-Wallis rank sum test of capital_loss across the over_50k groups.
  kruskal.test(capital_loss ~ over_50k, data = ovr50)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  capital_loss by over_50k
## Kruskal-Wallis chi-squared = 933.48, df = 1, p-value < 2.2e-16
  # Kruskal-Wallis chi-squared = 933.48, df = 1, p-value < 2.2e-16
  
  
    # hours per week: overall distribution, distribution by income class, and a
    # rank-based test of whether hours_week differs between income classes.

  # Histogram on the density scale (y = ..density..), so the axis is labelled
  # "Density" rather than "Count". The title previously read just "L".
  ggplot(ovr50, aes(x = hours_week)) +
    geom_histogram(aes(y = ..density..), alpha = 0.5) +
    labs(x = "Hours Per Week", y = "Density", title = "Hours Per Week")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  # One box per income class; coord_flip() lays the boxes out horizontally.
  ggplot(data = ovr50, aes(y = hours_week, x = over_50k, group = over_50k)) +
    geom_boxplot() +
    labs(y = "Hours Per Week", x = "Over 50k Income (1=yes)") +
    coord_flip()

  # Kruskal-Wallis rank sum test of hours_week across the over_50k groups.
  kruskal.test(hours_week ~ over_50k, data = ovr50)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  hours_week by over_50k
## Kruskal-Wallis chi-squared = 3512.3, df = 1, p-value < 2.2e-16
  # Kruskal-Wallis chi-squared = 3512.3, df = 1, p-value < 2.2e-16

Splitting into test/validate/train

##      train   validate       test 
## 0.69999181 0.19999181 0.09999591

Building the model

# STEP 1: continuous predictors must either satisfy the model assumptions or be binned.

# Check the linearity assumption by fitting a logistic GAM (REML) on the
# training split, with smooth terms s() on capital_gain, capital_loss and
# hours_week.
# education_num is left out because it carries the same information as
# education_level.
# NOTE(review): age enters as factor(age) (one dummy per distinct age) rather
# than as a smooth; the recorded summary shows standard errors around 3e+06 on
# those terms. Consider s(age) -- confirm intent first, since the recorded
# output would have to be regenerated.
fit.gam <- gam(
  over_50k ~ factor(age) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(occupation) + factor(relationship) +
    factor(race) + factor(sex) +
    s(capital_gain) + s(capital_loss) + s(hours_week),
  data = train,
  family = binomial(link = "logit"),
  method = "REML"
)
summary(fit.gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## over_50k ~ factor(age) + factor(workclass) + factor(education_level) + 
##     factor(marital_status) + factor(occupation) + factor(relationship) + 
##     factor(race) + factor(sex) + s(capital_gain) + s(capital_loss) + 
##     s(hours_week)
## 
## Parametric coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                 -8.603e+01  3.329e+06   0.000
## factor(age)18                                1.021e+01  4.309e+06   0.000
## factor(age)19                                7.790e+01  3.329e+06   0.000
## factor(age)20                                7.764e+01  3.329e+06   0.000
## factor(age)21                                7.809e+01  3.329e+06   0.000
## factor(age)22                                7.847e+01  3.329e+06   0.000
## factor(age)23                                7.848e+01  3.329e+06   0.000
## factor(age)24                                7.904e+01  3.329e+06   0.000
## factor(age)25                                7.967e+01  3.329e+06   0.000
## factor(age)26                                7.954e+01  3.329e+06   0.000
## factor(age)27                                7.967e+01  3.329e+06   0.000
## factor(age)28                                7.997e+01  3.329e+06   0.000
## factor(age)29                                8.026e+01  3.329e+06   0.000
## factor(age)30                                8.025e+01  3.329e+06   0.000
## factor(age)31                                8.029e+01  3.329e+06   0.000
## factor(age)32                                8.043e+01  3.329e+06   0.000
## factor(age)33                                8.017e+01  3.329e+06   0.000
## factor(age)34                                8.062e+01  3.329e+06   0.000
## factor(age)35                                8.046e+01  3.329e+06   0.000
## factor(age)36                                8.056e+01  3.329e+06   0.000
## factor(age)37                                8.091e+01  3.329e+06   0.000
## factor(age)38                                8.074e+01  3.329e+06   0.000
## factor(age)39                                8.098e+01  3.329e+06   0.000
## factor(age)40                                8.085e+01  3.329e+06   0.000
## factor(age)41                                8.079e+01  3.329e+06   0.000
## factor(age)42                                8.086e+01  3.329e+06   0.000
## factor(age)43                                8.095e+01  3.329e+06   0.000
## factor(age)44                                8.092e+01  3.329e+06   0.000
## factor(age)45                                8.086e+01  3.329e+06   0.000
## factor(age)46                                8.111e+01  3.329e+06   0.000
## factor(age)47                                8.114e+01  3.329e+06   0.000
## factor(age)48                                8.131e+01  3.329e+06   0.000
## factor(age)49                                8.103e+01  3.329e+06   0.000
## factor(age)50                                8.118e+01  3.329e+06   0.000
## factor(age)51                                8.116e+01  3.329e+06   0.000
## factor(age)52                                8.104e+01  3.329e+06   0.000
## factor(age)53                                8.110e+01  3.329e+06   0.000
## factor(age)54                                8.112e+01  3.329e+06   0.000
## factor(age)55                                8.085e+01  3.329e+06   0.000
## factor(age)56                                8.097e+01  3.329e+06   0.000
## factor(age)57                                8.105e+01  3.329e+06   0.000
## factor(age)58                                8.104e+01  3.329e+06   0.000
## factor(age)59                                8.103e+01  3.329e+06   0.000
## factor(age)60                                8.091e+01  3.329e+06   0.000
## factor(age)61                                8.109e+01  3.329e+06   0.000
## factor(age)62                                8.032e+01  3.329e+06   0.000
## factor(age)63                                8.086e+01  3.329e+06   0.000
## factor(age)64                                8.096e+01  3.329e+06   0.000
## factor(age)65                                8.041e+01  3.329e+06   0.000
## factor(age)66                                8.082e+01  3.329e+06   0.000
## factor(age)67                                8.027e+01  3.329e+06   0.000
## factor(age)68                                8.060e+01  3.329e+06   0.000
## factor(age)69                                8.037e+01  3.329e+06   0.000
## factor(age)70                                8.027e+01  3.329e+06   0.000
## factor(age)71                                8.051e+01  3.329e+06   0.000
## factor(age)72                                7.983e+01  3.329e+06   0.000
## factor(age)73                                8.091e+01  3.329e+06   0.000
## factor(age)74                                8.134e+01  3.329e+06   0.000
## factor(age)75                                8.078e+01  3.329e+06   0.000
## factor(age)76                                8.038e+01  3.329e+06   0.000
## factor(age)77                                8.007e+01  3.329e+06   0.000
## factor(age)78                                8.031e+01  3.329e+06   0.000
## factor(age)79                                8.164e+01  3.329e+06   0.000
## factor(age)80                                7.910e+01  3.329e+06   0.000
## factor(age)81                                8.150e+01  3.329e+06   0.000
## factor(age)82                                2.858e+00  1.891e+07   0.000
## factor(age)83                                8.010e+01  3.329e+06   0.000
## factor(age)84                                8.084e+01  3.329e+06   0.000
## factor(age)85                                8.058e+01  3.329e+06   0.000
## factor(age)86                                9.310e-01  6.719e+07   0.000
## factor(age)87                                2.347e+00  3.889e+07   0.000
## factor(age)88                                7.890e+01  3.329e+06   0.000
## factor(age)89                               -9.332e-01  4.757e+07   0.000
## factor(age)90                                7.994e+01  3.329e+06   0.000
## factor(workclass)Federal-gov                 4.644e-01  1.254e+00   0.370
## factor(workclass)Local-gov                  -2.372e-01  1.260e+00  -0.188
## factor(workclass)Never-worked               -4.740e+01  2.376e+07   0.000
## factor(workclass)Private                     3.256e-02  1.258e+00   0.026
## factor(workclass)Self-emp-inc                2.299e-01  1.260e+00   0.182
## factor(workclass)Self-emp-not-inc           -4.401e-01  1.259e+00  -0.350
## factor(workclass)State-gov                  -2.818e-01  1.261e+00  -0.224
## factor(workclass)Without-pay                 4.412e-01  1.537e+00   0.287
## factor(education_level)11th                  4.081e-02  2.167e-01   0.188
## factor(education_level)12th                  4.379e-01  2.612e-01   1.676
## factor(education_level)1st-4th              -8.125e-01  4.670e-01  -1.740
## factor(education_level)5th-6th              -3.088e-01  3.024e-01  -1.021
## factor(education_level)7th-8th              -4.550e-01  2.342e-01  -1.943
## factor(education_level)9th                  -1.482e-01  2.574e-01  -0.576
## factor(education_level)Assoc-acdm            1.388e+00  1.790e-01   7.752
## factor(education_level)Assoc-voc             1.274e+00  1.726e-01   7.380
## factor(education_level)Bachelors             1.937e+00  1.607e-01  12.051
## factor(education_level)Doctorate             2.799e+00  2.162e-01  12.945
## factor(education_level)HS-grad               8.527e-01  1.565e-01   5.448
## factor(education_level)Masters               2.163e+00  1.707e-01  12.670
## factor(education_level)Preschool            -3.684e+01  8.812e+06   0.000
## factor(education_level)Prof-school           2.886e+00  2.058e-01  14.019
## factor(education_level)Some-college          1.214e+00  1.589e-01   7.637
## factor(marital_status)Married-AF-spouse      2.502e+00  5.949e-01   4.206
## factor(marital_status)Married-civ-spouse     2.227e+00  2.698e-01   8.253
## factor(marital_status)Married-spouse-absent  1.816e-01  2.239e-01   0.811
## factor(marital_status)Never-married         -1.329e-01  9.011e-02  -1.475
## factor(marital_status)Separated              4.114e-02  1.633e-01   0.252
## factor(marital_status)Widowed                4.982e-01  1.571e-01   3.171
## factor(occupation)Adm-clerical               2.371e-01  1.254e+00   0.189
## factor(occupation)Armed-Forces               0.000e+00  0.000e+00     NaN
## factor(occupation)Craft-repair               2.709e-01  1.254e+00   0.216
## factor(occupation)Exec-managerial            9.681e-01  1.254e+00   0.772
## factor(occupation)Farming-fishing           -5.516e-01  1.259e+00  -0.438
## factor(occupation)Handlers-cleaners         -3.619e-01  1.259e+00  -0.287
## factor(occupation)Machine-op-inspct         -6.509e-02  1.256e+00  -0.052
## factor(occupation)Other-service             -6.882e-01  1.257e+00  -0.547
## factor(occupation)Priv-house-serv           -1.615e+00  1.625e+00  -0.994
## factor(occupation)Prof-specialty             7.282e-01  1.254e+00   0.581
## factor(occupation)Protective-serv            8.467e-01  1.258e+00   0.673
## factor(occupation)Sales                      5.380e-01  1.254e+00   0.429
## factor(occupation)Tech-support               8.326e-01  1.256e+00   0.663
## factor(occupation)Transport-moving           2.231e-01  1.255e+00   0.178
## factor(relationship)Not-in-family            3.401e-01  2.677e-01   1.270
## factor(relationship)Other-relative          -4.755e-01  2.517e-01  -1.889
## factor(relationship)Own-child               -4.202e-01  2.634e-01  -1.595
## factor(relationship)Unmarried                2.025e-02  2.849e-01   0.071
## factor(relationship)Wife                     1.212e+00  1.040e-01  11.659
## factor(race)Asian-Pac-Islander               4.456e-01  2.421e-01   1.840
## factor(race)Black                            2.212e-01  2.321e-01   0.953
## factor(race)Other                            3.688e-01  3.288e-01   1.121
## factor(race)White                            5.934e-01  2.210e-01   2.686
## factor(sex)Male                              6.582e-01  8.088e-02   8.138
##                                             Pr(>|z|)    
## (Intercept)                                  0.99998    
## factor(age)18                                1.00000    
## factor(age)19                                0.99998    
## factor(age)20                                0.99998    
## factor(age)21                                0.99998    
## factor(age)22                                0.99998    
## factor(age)23                                0.99998    
## factor(age)24                                0.99998    
## factor(age)25                                0.99998    
## factor(age)26                                0.99998    
## factor(age)27                                0.99998    
## factor(age)28                                0.99998    
## factor(age)29                                0.99998    
## factor(age)30                                0.99998    
## factor(age)31                                0.99998    
## factor(age)32                                0.99998    
## factor(age)33                                0.99998    
## factor(age)34                                0.99998    
## factor(age)35                                0.99998    
## factor(age)36                                0.99998    
## factor(age)37                                0.99998    
## factor(age)38                                0.99998    
## factor(age)39                                0.99998    
## factor(age)40                                0.99998    
## factor(age)41                                0.99998    
## factor(age)42                                0.99998    
## factor(age)43                                0.99998    
## factor(age)44                                0.99998    
## factor(age)45                                0.99998    
## factor(age)46                                0.99998    
## factor(age)47                                0.99998    
## factor(age)48                                0.99998    
## factor(age)49                                0.99998    
## factor(age)50                                0.99998    
## factor(age)51                                0.99998    
## factor(age)52                                0.99998    
## factor(age)53                                0.99998    
## factor(age)54                                0.99998    
## factor(age)55                                0.99998    
## factor(age)56                                0.99998    
## factor(age)57                                0.99998    
## factor(age)58                                0.99998    
## factor(age)59                                0.99998    
## factor(age)60                                0.99998    
## factor(age)61                                0.99998    
## factor(age)62                                0.99998    
## factor(age)63                                0.99998    
## factor(age)64                                0.99998    
## factor(age)65                                0.99998    
## factor(age)66                                0.99998    
## factor(age)67                                0.99998    
## factor(age)68                                0.99998    
## factor(age)69                                0.99998    
## factor(age)70                                0.99998    
## factor(age)71                                0.99998    
## factor(age)72                                0.99998    
## factor(age)73                                0.99998    
## factor(age)74                                0.99998    
## factor(age)75                                0.99998    
## factor(age)76                                0.99998    
## factor(age)77                                0.99998    
## factor(age)78                                0.99998    
## factor(age)79                                0.99998    
## factor(age)80                                0.99998    
## factor(age)81                                0.99998    
## factor(age)82                                1.00000    
## factor(age)83                                0.99998    
## factor(age)84                                0.99998    
## factor(age)85                                0.99998    
## factor(age)86                                1.00000    
## factor(age)87                                1.00000    
## factor(age)88                                0.99998    
## factor(age)89                                1.00000    
## factor(age)90                                0.99998    
## factor(workclass)Federal-gov                 0.71123    
## factor(workclass)Local-gov                   0.85065    
## factor(workclass)Never-worked                1.00000    
## factor(workclass)Private                     0.97935    
## factor(workclass)Self-emp-inc                0.85525    
## factor(workclass)Self-emp-not-inc            0.72666    
## factor(workclass)State-gov                   0.82311    
## factor(workclass)Without-pay                 0.77409    
## factor(education_level)11th                  0.85058    
## factor(education_level)12th                  0.09368 .  
## factor(education_level)1st-4th               0.08189 .  
## factor(education_level)5th-6th               0.30713    
## factor(education_level)7th-8th               0.05200 .  
## factor(education_level)9th                   0.56470    
## factor(education_level)Assoc-acdm           9.04e-15 ***
## factor(education_level)Assoc-voc            1.58e-13 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(education_level)HS-grad              5.08e-08 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Preschool             1.00000    
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Some-college         2.22e-14 ***
## factor(marital_status)Married-AF-spouse     2.60e-05 ***
## factor(marital_status)Married-civ-spouse     < 2e-16 ***
## factor(marital_status)Married-spouse-absent  0.41730    
## factor(marital_status)Never-married          0.14015    
## factor(marital_status)Separated              0.80109    
## factor(marital_status)Widowed                0.00152 ** 
## factor(occupation)Adm-clerical               0.85002    
## factor(occupation)Armed-Forces                   NaN    
## factor(occupation)Craft-repair               0.82897    
## factor(occupation)Exec-managerial            0.44001    
## factor(occupation)Farming-fishing            0.66136    
## factor(occupation)Handlers-cleaners          0.77376    
## factor(occupation)Machine-op-inspct          0.95866    
## factor(occupation)Other-service              0.58418    
## factor(occupation)Priv-house-serv            0.32025    
## factor(occupation)Prof-specialty             0.56139    
## factor(occupation)Protective-serv            0.50100    
## factor(occupation)Sales                      0.66802    
## factor(occupation)Tech-support               0.50747    
## factor(occupation)Transport-moving           0.85897    
## factor(relationship)Not-in-family            0.20394    
## factor(relationship)Other-relative           0.05885 .  
## factor(relationship)Own-child                0.11061    
## factor(relationship)Unmarried                0.94334    
## factor(relationship)Wife                     < 2e-16 ***
## factor(race)Asian-Pac-Islander               0.06571 .  
## factor(race)Black                            0.34055    
## factor(race)Other                            0.26208    
## factor(race)White                            0.00723 ** 
## factor(sex)Male                             4.01e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df Chi.sq p-value    
## s(capital_gain) 7.054  7.768  635.0  <2e-16 ***
## s(capital_loss) 7.756  8.483  321.7  <2e-16 ***
## s(hours_week)   6.091  7.141  287.1  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Rank: 153/154
## R-sq.(adj) =  0.473   Deviance explained = 45.8%
## -REML =  10179  Scale est. = 1         n = 34189
# Draw the estimated smooth terms of the GAM fitted above.
plot(fit.gam)

# None of the continuous predictors behaves linearly: the smooths have an
# effective degrees of freedom of roughly 7 (capital_gain), 8 (capital_loss)
# and 6 (hours_week). All three are therefore binned instead of being entered
# as linear terms.

# capital_gain and capital_loss are heavily skewed, so each one is reduced to a
# 0/1 indicator: 0 = value is exactly zero, 1 = any non-zero value.
train <- train %>%
  mutate(
    capital_gain = as.numeric(capital_gain != 0),
    capital_loss = as.numeric(capital_loss != 0)
  )


# hours_week is binned around its median of 40 hours per week:
#   0 = exactly 40, 1 = fewer than 40, 2 = more than 40.
# The recode is done in three sequential steps and every intermediate column
# stays on `train`:
#   hours_week      - values below 40 are overwritten with 1
#   hours_week_bin  - hours_week with values above 40 replaced by 2
#   hours_week_bins - hours_week_bin with 40 replaced by 0 (final 0/1/2 code)
# NOTE: the raw hours_week values below 40 are lost after this step.
# mutate() evaluates its arguments in order, so each line sees the column
# produced by the line before it.
train <- train %>%
  mutate(
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin)
  )

#View(train)
# Check for separation.
# Every predictor that will enter the model as a factor (plus raw age) is
# collected, together with the response over_50k, into its own data frame so
# that each one can be cross-tabulated against the response.
cat_var <- train %>%
  dplyr::select(
    over_50k,
    workclass,
    education_level,
    marital_status,
    occupation,
    relationship,
    race,
    sex,
    country,
    age,
    capital_gain,
    capital_loss,
    hours_week_bins
  )

# Print one response-by-predictor table per column. A zero cell means that
# level occurs with only one outcome class, i.e. it causes separation.
# Iterating over the column names (instead of 1:length(...)) stays correct even
# if the selection above were ever empty. The first table is over_50k against
# itself and carries no information.
for (var_name in names(cat_var)) {
  print(var_name)
  print(table(cat_var$over_50k, cat_var[[var_name]]))
}
## [1] "over_50k"
##    
##         0     1
##   0 26043     0
##   1     0  8146
## [1] "workclass"
##    
##         ? Federal-gov Local-gov Never-worked Private Self-emp-inc
##   0  1813         603      1526            8   18631          532
##   1   189         386       598            0    5184          657
##    
##     Self-emp-not-inc State-gov Without-pay
##   0             1930       993           7
##   1              757       373           2
## [1] "education_level"
##    
##     10th 11th 12th 1st-4th 5th-6th 7th-8th  9th Assoc-acdm Assoc-voc Bachelors
##   0  910 1227  433     161     335     660  505        832      1077      3300
##   1   58   62   35       7      20      40   29        291       358      2311
##    
##     Doctorate HS-grad Masters Preschool Prof-school Some-college
##   0       109    9238     840        58         149         6209
##   1       303    1760    1002         0         437         1433
## [1] "marital_status"
##    
##     Divorced Married-AF-spouse Married-civ-spouse Married-spouse-absent
##   0     4159                15               8606                   400
##   1      459                 8               6980                    38
##    
##     Never-married Separated Widowed
##   0         10858      1008     997
##   1           500        71      90
## [1] "occupation"
##    
##        ? Adm-clerical Armed-Forces Craft-repair Exec-managerial Farming-fishing
##   0 1821         3393            8         3342            2204             936
##   1  189          538            3          992            2029             121
##    
##     Handlers-cleaners Machine-op-inspct Other-service Priv-house-serv
##   0              1371              1847          3305             170
##   1               103               260           138               1
##    
##     Prof-specialty Protective-serv Sales Tech-support Transport-moving
##   0           2396             451  2774          746             1279
##   1           1904             198  1030          308              332
## [1] "relationship"
##    
##     Husband Not-in-family Other-relative Own-child Unmarried Wife
##   0    7518          7884           1028      5343      3395  875
##   1    6177           873             34        77       210  775
## [1] "race"
##    
##     Amer-Indian-Eskimo Asian-Pac-Islander Black Other White
##   0                307                786  2947   256 21747
##   1                 40                278   376    34  7418
## [1] "sex"
##    
##     Female  Male
##   0  10100 15943
##   1   1235  6911
## [1] "country"
##    
##         ? Cambodia Canada China Columbia  Cuba Dominican-Republic Ecuador
##   0   445       11     90    58       62    64                 66      32
##   1   167        8     46    25        2    25                  4       6
##    
##     El-Salvador England France Germany Greece Guatemala Haiti
##   0         100      52     12     104     24        57    44
##   1           6      32     12      49     12         2     7
##    
##     Holand-Netherlands Honduras  Hong Hungary India  Iran Ireland Italy Jamaica
##   0                  1       15    19       9    65    26      23    49      74
##   1                  0        0     4       5    42    17       6    24      13
##    
##     Japan  Laos Mexico Nicaragua Outlying-US(Guam-USVI-etc)  Peru Philippines
##   0    37    13    639        30                         15    25         144
##   1    22     0     30         1                          1     2          56
##    
##     Poland Portugal Puerto-Rico Scotland South Taiwan Thailand Trinadad&Tobago
##   0     47       40         119       13    63     27       16              19
##   1     12        9          14        3    14     21        1               2
##    
##     United-States Vietnam Yugoslavia
##   0         23228      57          9
##   1          7434       5          5
## [1] "age"
##    
##      17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
##   0 407 600 746 801 788 831 943 824 797 742 775 765 708 717 723 682 744 693 700
##   1   0   0   1   1   5  11  14  33  57  62  79 123 146 163 186 219 196 242 248
##    
##      36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
##   0 680 587 577 542 571 548 514 472 448 499 474 446 359 376 331 367 310 308 265
##   1 273 292 286 303 285 291 285 287 278 275 320 294 250 226 254 239 183 195 169
##    
##      55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72  73
##   0 292 258 235 249 249 213 198 199 162 163 145 129 123 106  89  79  58  68  63
##   1 143 133 140 131 133  90 100  65  65  66  51  41  39  30  26  19  16   9  12
##    
##      74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##   0  40  43  41  36  17  13  23  18  13   4   9   4   1   3   5   2  33
##   1  12  10   5   3   3   9   3   7   0   1   1   1   0   0   1   0  10
## [1] "capital_gain"
##    
##         0     1
##   0 24958  1085
##   1  6416  1730
## [1] "capital_loss"
##    
##         0     1
##   0 25238   805
##   1  7360   786
## [1] "hours_week_bins"
##    
##         0     1     2
##   0 12567  7506  5970
##   1  3376   794  3976
# Raw age shows separation (several ages have no over_50k == 1 rows), so it is
# grouped into 0-24, 25-44, 45-64 and 65+; the author took these groups from
# the Census website. cut() builds right-closed intervals, giving the levels
# (0,24], (24,44], (44,64] and (64,140].
train$age_bin <- cut(train$age, breaks = c(0, 24, 44, 64, 140))
head(train, n = 10)
##    id age workclass education_level education_num        marital_status
## 1   1  39 State-gov       Bachelors            13         Never-married
## 3   3  38   Private         HS-grad             9              Divorced
## 5   5  28   Private       Bachelors            13    Married-civ-spouse
## 6   6  37   Private         Masters            14    Married-civ-spouse
## 7   7  49   Private             9th             5 Married-spouse-absent
## 9   9  31   Private         Masters            14         Never-married
## 13 13  23   Private       Bachelors            13         Never-married
## 14 14  32   Private      Assoc-acdm            12         Never-married
## 15 15  40   Private       Assoc-voc            11    Married-civ-spouse
## 16 16  34   Private         7th-8th             4    Married-civ-spouse
##           occupation  relationship               race    sex capital_gain
## 1       Adm-clerical Not-in-family              White   Male            1
## 3  Handlers-cleaners Not-in-family              White   Male            0
## 5     Prof-specialty          Wife              Black Female            0
## 6    Exec-managerial          Wife              White Female            0
## 7      Other-service Not-in-family              Black Female            0
## 9     Prof-specialty Not-in-family              White Female            1
## 13      Adm-clerical     Own-child              White Female            0
## 14             Sales Not-in-family              Black   Male            0
## 15      Craft-repair       Husband Asian-Pac-Islander   Male            0
## 16  Transport-moving       Husband Amer-Indian-Eskimo   Male            0
##    capital_loss hours_week       country over_50k hours_week_bin
## 1             0         40 United-States        0             40
## 3             0         40 United-States        0             40
## 5             0         40          Cuba        0             40
## 6             0         40 United-States        0             40
## 7             0          1       Jamaica        0              1
## 9             0         50 United-States        1              2
## 13            0          1 United-States        0              1
## 14            0         50 United-States        0              2
## 15            0         40             ?        1             40
## 16            0         45        Mexico        0              2
##    hours_week_bins age_bin
## 1                0 (24,44]
## 3                0 (24,44]
## 5                0 (24,44]
## 6                0 (24,44]
## 7                1 (44,64]
## 9                2 (24,44]
## 13               1  (0,24]
## 14               2 (24,44]
## 15               0 (24,44]
## 16               2 (24,44]
# Collapse the levels that showed separation in the tables above (levels with
# no over_50k == 1 rows) into a neighbouring / catch-all level.
# NOTE(review): ifelse() returns the "no" branch as-is, so these columns are
# assumed to be character vectors; a factor would come back as integer codes.

# country: Holand-Netherlands, Honduras and Laos are merged into '?'.
# (Fixed: the original compared against the misspelling 'Loas', so the Laos
# rows were never recoded.)
train$country <- ifelse(
  train$country %in% c("Holand-Netherlands", "Honduras", "Laos"),
  "?",
  train$country
)

# education_level: Preschool is merged into 1st-4th.
train$education_level <- ifelse(
  train$education_level == "Preschool",
  "1st-4th",
  train$education_level
)

# workclass: Never-worked is merged into '?'.
train$workclass <- ifelse(
  train$workclass == "Never-worked",
  "?",
  train$workclass
)


# Re-level education_level. HS-grad is listed first so that it becomes the
# reference level; the remaining levels follow in ascending order of schooling.
# NOTE(review): "Preschool" is still listed although those rows were merged
# into 1st-4th above, so it is an empty level here.
train$education_level <- factor(
  train$education_level,
  levels = c(
    "HS-grad",
    "Preschool", "1st-4th", "5th-6th", "7th-8th",
    "9th", "10th", "11th", "12th",
    "Some-college", "Assoc-voc", "Assoc-acdm",
    "Bachelors", "Masters", "Prof-school", "Doctorate"
  )
)

# Row counts per education level, largest first.
count(train, education_level, sort = TRUE)
##    education_level     n
## 1          HS-grad 10998
## 2     Some-college  7642
## 3        Bachelors  5611
## 4          Masters  1842
## 5        Assoc-voc  1435
## 6             11th  1289
## 7       Assoc-acdm  1123
## 8             10th   968
## 9          7th-8th   700
## 10     Prof-school   586
## 11             9th   534
## 12            12th   468
## 13       Doctorate   412
## 14         5th-6th   355
## 15         1st-4th   226

Building the model

# STEP 2: variable selection.
# Plan: backward selection over the main effects first, then forward selection
# over interactions (taking care with interactions that show separation).

# Main-effects logistic regression of over_50k on all binned and categorical
# predictors. country is not included in the formula.
full.model <- glm(
  over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(occupation) + factor(relationship) +
    factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) +
    factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = train
)

# Backward elimination with step()'s default penalty (AIC).
back.model <- step(full.model, direction = "backward")
## Start:  AIC=22403.58
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
##     factor(marital_status) + factor(occupation) + factor(relationship) + 
##     factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## <none>                          22288 22404
## - factor(race)             4    22312 22420
## - factor(sex)              1    22368 22482
## - factor(marital_status)   6    22389 22493
## - factor(workclass)        6    22413 22517
## - factor(relationship)     5    22515 22621
## - factor(capital_loss)     1    22525 22639
## - factor(hours_week_bins)  2    22612 22724
## - factor(age_bin)          3    22750 22860
## - factor(occupation)      13    22861 22951
## - factor(capital_gain)     1    23306 23420
## - factor(education_level) 14    23468 23556
# Start:  AIC=22403.58
# over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
#     factor(marital_status) + factor(occupation) + factor(relationship) + 
#     factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) + 
#     factor(hours_week_bins)

# Same backward elimination, but with the penalty per degree of freedom set to
# the upper-tail chi-square (1 df) critical value at 0.02 instead of the
# default AIC penalty.
back.model1 <- step(
  full.model,
  direction = "backward",
  k = qchisq(0.02, df = 1, lower.tail = FALSE)
)
## Start:  AIC=22601.47
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
##     factor(marital_status) + factor(occupation) + factor(relationship) + 
##     factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## <none>                          22288 22602
## - factor(race)             4    22312 22604
## - factor(marital_status)   6    22389 22671
## - factor(sex)              1    22368 22676
## - factor(workclass)        6    22413 22694
## - factor(relationship)     5    22515 22802
## - factor(capital_loss)     1    22525 22833
## - factor(hours_week_bins)  2    22612 22915
## - factor(age_bin)          3    22750 23048
## - factor(occupation)      13    22861 23105
## - factor(capital_gain)     1    23306 23614
## - factor(education_level) 14    23468 23706
# Coefficient table and fit statistics for the AIC-based backward-selection
# model (back.model).
summary(back.model)
## 
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(workclass) + 
##     factor(education_level) + factor(marital_status) + factor(occupation) + 
##     factor(relationship) + factor(race) + factor(sex) + factor(capital_gain) + 
##     factor(capital_loss) + factor(hours_week_bins), family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7498  -0.5226  -0.1811  -0.0275   4.0001  
## 
## Coefficients: (1 not defined because of singularities)
##                                              Estimate Std. Error z value
## (Intercept)                                 -6.448261   0.371269 -17.368
## factor(age_bin)(24,44]                       1.656461   0.137234  12.070
## factor(age_bin)(44,64]                       2.170419   0.139412  15.568
## factor(age_bin)(64,140]                      1.579797   0.162497   9.722
## factor(workclass)Federal-gov                 0.899810   0.150766   5.968
## factor(workclass)Local-gov                   0.171875   0.137616   1.249
## factor(workclass)Private                     0.429260   0.121779   3.525
## factor(workclass)Self-emp-inc                0.753491   0.143522   5.250
## factor(workclass)Self-emp-not-inc            0.016950   0.132230   0.128
## factor(workclass)State-gov                   0.141722   0.146792   0.965
## factor(workclass)Without-pay                 0.694739   0.891174   0.780
## factor(education_level)1st-4th              -1.613906   0.404710  -3.988
## factor(education_level)5th-6th              -1.114677   0.252057  -4.422
## factor(education_level)7th-8th              -1.412574   0.177856  -7.942
## factor(education_level)9th                  -1.157515   0.208327  -5.556
## factor(education_level)10th                 -0.873925   0.151609  -5.764
## factor(education_level)11th                 -0.773510   0.147152  -5.257
## factor(education_level)12th                 -0.454428   0.204155  -2.226
## factor(education_level)Some-college          0.356250   0.048689   7.317
## factor(education_level)Assoc-voc             0.446368   0.080946   5.514
## factor(education_level)Assoc-acdm            0.569394   0.091937   6.193
## factor(education_level)Bachelors             1.103616   0.052001  21.223
## factor(education_level)Masters               1.407614   0.074980  18.773
## factor(education_level)Prof-school           2.116841   0.129627  16.330
## factor(education_level)Doctorate             2.069027   0.146446  14.128
## factor(marital_status)Married-AF-spouse      1.926826   0.587763   3.278
## factor(marital_status)Married-civ-spouse     1.988704   0.252131   7.888
## factor(marital_status)Married-spouse-absent  0.125819   0.204445   0.615
## factor(marital_status)Never-married         -0.326844   0.080815  -4.044
## factor(marital_status)Separated              0.003239   0.150225   0.022
## factor(marital_status)Widowed                0.465757   0.139360   3.342
## factor(occupation)Adm-clerical               0.031466   0.095653   0.329
## factor(occupation)Armed-Forces               0.048946   1.196174   0.041
## factor(occupation)Craft-repair               0.051737   0.082067   0.630
## factor(occupation)Exec-managerial            0.774225   0.084377   9.176
## factor(occupation)Farming-fishing           -0.743849   0.133872  -5.556
## factor(occupation)Handlers-cleaners         -0.598774   0.135004  -4.435
## factor(occupation)Machine-op-inspct         -0.388406   0.103214  -3.763
## factor(occupation)Other-service             -0.875708   0.120704  -7.255
## factor(occupation)Priv-house-serv           -1.905857   1.021947  -1.865
## factor(occupation)Prof-specialty             0.515972   0.090281   5.715
## factor(occupation)Protective-serv            0.565573   0.130391   4.338
## factor(occupation)Sales                      0.300460   0.087461   3.435
## factor(occupation)Tech-support               0.597144   0.114160   5.231
## factor(occupation)Transport-moving                 NA         NA      NA
## factor(relationship)Not-in-family            0.244824   0.250096   0.979
## factor(relationship)Other-relative          -0.562082   0.238346  -2.358
## factor(relationship)Own-child               -0.588972   0.247459  -2.380
## factor(relationship)Unmarried               -0.039902   0.265146  -0.150
## factor(relationship)Wife                     1.132829   0.096031  11.796
## factor(race)Asian-Pac-Islander               0.318531   0.224299   1.420
## factor(race)Black                            0.199789   0.213800   0.934
## factor(race)Other                            0.234704   0.304568   0.771
## factor(race)White                            0.482096   0.203218   2.372
## factor(sex)Male                              0.641961   0.072498   8.855
## factor(capital_gain)1                        1.690866   0.054822  30.843
## factor(capital_loss)1                        1.053357   0.068661  15.341
## factor(hours_week_bins)1                    -0.577007   0.057304 -10.069
## factor(hours_week_bins)2                     0.425491   0.037839  11.245
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## factor(age_bin)(24,44]                       < 2e-16 ***
## factor(age_bin)(44,64]                       < 2e-16 ***
## factor(age_bin)(64,140]                      < 2e-16 ***
## factor(workclass)Federal-gov                2.40e-09 ***
## factor(workclass)Local-gov                  0.211685    
## factor(workclass)Private                    0.000424 ***
## factor(workclass)Self-emp-inc               1.52e-07 ***
## factor(workclass)Self-emp-not-inc           0.898005    
## factor(workclass)State-gov                  0.334314    
## factor(workclass)Without-pay                0.435640    
## factor(education_level)1st-4th              6.67e-05 ***
## factor(education_level)5th-6th              9.76e-06 ***
## factor(education_level)7th-8th              1.99e-15 ***
## factor(education_level)9th                  2.76e-08 ***
## factor(education_level)10th                 8.20e-09 ***
## factor(education_level)11th                 1.47e-07 ***
## factor(education_level)12th                 0.026021 *  
## factor(education_level)Some-college         2.54e-13 ***
## factor(education_level)Assoc-voc            3.50e-08 ***
## factor(education_level)Assoc-acdm           5.89e-10 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(marital_status)Married-AF-spouse     0.001045 ** 
## factor(marital_status)Married-civ-spouse    3.08e-15 ***
## factor(marital_status)Married-spouse-absent 0.538280    
## factor(marital_status)Never-married         5.25e-05 ***
## factor(marital_status)Separated             0.982798    
## factor(marital_status)Widowed               0.000831 ***
## factor(occupation)Adm-clerical              0.742185    
## factor(occupation)Armed-Forces              0.967361    
## factor(occupation)Craft-repair              0.528416    
## factor(occupation)Exec-managerial            < 2e-16 ***
## factor(occupation)Farming-fishing           2.75e-08 ***
## factor(occupation)Handlers-cleaners         9.20e-06 ***
## factor(occupation)Machine-op-inspct         0.000168 ***
## factor(occupation)Other-service             4.02e-13 ***
## factor(occupation)Priv-house-serv           0.062192 .  
## factor(occupation)Prof-specialty            1.10e-08 ***
## factor(occupation)Protective-serv           1.44e-05 ***
## factor(occupation)Sales                     0.000592 ***
## factor(occupation)Tech-support              1.69e-07 ***
## factor(occupation)Transport-moving                NA    
## factor(relationship)Not-in-family           0.327620    
## factor(relationship)Other-relative          0.018361 *  
## factor(relationship)Own-child               0.017309 *  
## factor(relationship)Unmarried               0.880377    
## factor(relationship)Wife                     < 2e-16 ***
## factor(race)Asian-Pac-Islander              0.155573    
## factor(race)Black                           0.350062    
## factor(race)Other                           0.440936    
## factor(race)White                           0.017677 *  
## factor(sex)Male                              < 2e-16 ***
## factor(capital_gain)1                        < 2e-16 ***
## factor(capital_loss)1                        < 2e-16 ***
## factor(hours_week_bins)1                     < 2e-16 ***
## factor(hours_week_bins)2                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37544  on 34188  degrees of freedom
## Residual deviance: 22288  on 34131  degrees of freedom
## AIC: 22404
## 
## Number of Fisher Scoring iterations: 7
library(car)
## Loading required package: carData
## 
## Attaching package: 'carData'
## The following object is masked from 'package:vcdExtra':
## 
##     Burt
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
#car::vif(back.model)
#tells me there is multicollinearity
#alias(back.model)
#tells me there is multicollinearity with occupation and workclass

# Refit without workclass: alias() on the previous fit pointed at collinearity
# between occupation and workclass.
full.model2 <- glm(
  over_50k ~ factor(age_bin) + factor(education_level) +
    factor(marital_status) + factor(occupation) + factor(relationship) +
    factor(race) + factor(sex) + factor(capital_gain) +
    factor(capital_loss) + factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = train
)

# Backward elimination (AIC-based) starting from the specification above.
back.model2 <- step(full.model2, direction = "backward")
## Start:  AIC=22516.96
## over_50k ~ factor(age_bin) + factor(education_level) + factor(marital_status) + 
##     factor(occupation) + factor(relationship) + factor(race) + 
##     factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## <none>                          22413 22517
## - factor(race)             4    22435 22531
## - factor(sex)              1    22497 22599
## - factor(marital_status)   6    22512 22604
## - factor(relationship)     5    22641 22735
## - factor(capital_loss)     1    22652 22754
## - factor(hours_week_bins)  2    22761 22861
## - factor(age_bin)          3    22870 22968
## - factor(occupation)      14    23071 23147
## - factor(capital_gain)     1    23444 23546
## - factor(education_level) 14    23583 23659
# Coefficient table, deviances and AIC for the model returned by step()
# (occupation kept, workclass dropped).
summary(back.model2)
## 
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(education_level) + 
##     factor(marital_status) + factor(occupation) + factor(relationship) + 
##     factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins), family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7361  -0.5249  -0.1825  -0.0272   4.0102  
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 -6.397067   0.370176 -17.281
## factor(age_bin)(24,44]                       1.644923   0.136980  12.008
## factor(age_bin)(44,64]                       2.148011   0.139005  15.453
## factor(age_bin)(64,140]                      1.555307   0.161838   9.610
## factor(education_level)1st-4th              -1.589103   0.407602  -3.899
## factor(education_level)5th-6th              -1.121447   0.252376  -4.444
## factor(education_level)7th-8th              -1.432489   0.177394  -8.075
## factor(education_level)9th                  -1.162270   0.208339  -5.579
## factor(education_level)10th                 -0.874861   0.151180  -5.787
## factor(education_level)11th                 -0.763288   0.146855  -5.198
## factor(education_level)12th                 -0.444969   0.203832  -2.183
## factor(education_level)Some-college          0.362922   0.048516   7.481
## factor(education_level)Assoc-voc             0.449157   0.080662   5.568
## factor(education_level)Assoc-acdm            0.578885   0.091451   6.330
## factor(education_level)Bachelors             1.096214   0.051753  21.181
## factor(education_level)Masters               1.366347   0.074043  18.454
## factor(education_level)Prof-school           2.090020   0.128861  16.219
## factor(education_level)Doctorate             2.022804   0.145095  13.941
## factor(marital_status)Married-AF-spouse      1.941904   0.590947   3.286
## factor(marital_status)Married-civ-spouse     1.973955   0.251657   7.844
## factor(marital_status)Married-spouse-absent  0.120495   0.204064   0.590
## factor(marital_status)Never-married         -0.317581   0.080559  -3.942
## factor(marital_status)Separated              0.006861   0.149773   0.046
## factor(marital_status)Widowed                0.467052   0.139198   3.355
## factor(occupation)Adm-clerical               0.467028   0.112744   4.142
## factor(occupation)Armed-Forces               0.919735   1.184404   0.777
## factor(occupation)Craft-repair               0.400925   0.106936   3.749
## factor(occupation)Exec-managerial            1.163380   0.104973  11.083
## factor(occupation)Farming-fishing           -0.551125   0.146494  -3.762
## factor(occupation)Handlers-cleaners         -0.211337   0.151050  -1.399
## factor(occupation)Machine-op-inspct         -0.002647   0.123661  -0.021
## factor(occupation)Other-service             -0.524559   0.136107  -3.854
## factor(occupation)Priv-house-serv           -1.473875   1.023143  -1.441
## factor(occupation)Prof-specialty             0.858205   0.107348   7.995
## factor(occupation)Protective-serv            0.804798   0.141771   5.677
## factor(occupation)Sales                      0.681602   0.107893   6.317
## factor(occupation)Tech-support               1.002134   0.130011   7.708
## factor(occupation)Transport-moving           0.362008   0.121020   2.991
## factor(relationship)Not-in-family            0.237710   0.249631   0.952
## factor(relationship)Other-relative          -0.590688   0.238881  -2.473
## factor(relationship)Own-child               -0.604378   0.246811  -2.449
## factor(relationship)Unmarried               -0.051128   0.264618  -0.193
## factor(relationship)Wife                     1.123711   0.095834  11.726
## factor(race)Asian-Pac-Islander               0.333622   0.223559   1.492
## factor(race)Black                            0.224368   0.213400   1.051
## factor(race)Other                            0.260960   0.304328   0.857
## factor(race)White                            0.485199   0.202687   2.394
## factor(sex)Male                              0.656458   0.072266   9.084
## factor(capital_gain)1                        1.692116   0.054581  31.002
## factor(capital_loss)1                        1.054260   0.068437  15.405
## factor(hours_week_bins)1                    -0.632965   0.056808 -11.142
## factor(hours_week_bins)2                     0.410302   0.037327  10.992
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## factor(age_bin)(24,44]                       < 2e-16 ***
## factor(age_bin)(44,64]                       < 2e-16 ***
## factor(age_bin)(64,140]                      < 2e-16 ***
## factor(education_level)1st-4th              9.67e-05 ***
## factor(education_level)5th-6th              8.85e-06 ***
## factor(education_level)7th-8th              6.74e-16 ***
## factor(education_level)9th                  2.42e-08 ***
## factor(education_level)10th                 7.17e-09 ***
## factor(education_level)11th                 2.02e-07 ***
## factor(education_level)12th                 0.029035 *  
## factor(education_level)Some-college         7.40e-14 ***
## factor(education_level)Assoc-voc            2.57e-08 ***
## factor(education_level)Assoc-acdm           2.45e-10 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(marital_status)Married-AF-spouse     0.001016 ** 
## factor(marital_status)Married-civ-spouse    4.37e-15 ***
## factor(marital_status)Married-spouse-absent 0.554872    
## factor(marital_status)Never-married         8.07e-05 ***
## factor(marital_status)Separated             0.963460    
## factor(marital_status)Widowed               0.000793 ***
## factor(occupation)Adm-clerical              3.44e-05 ***
## factor(occupation)Armed-Forces              0.437431    
## factor(occupation)Craft-repair              0.000177 ***
## factor(occupation)Exec-managerial            < 2e-16 ***
## factor(occupation)Farming-fishing           0.000168 ***
## factor(occupation)Handlers-cleaners         0.161777    
## factor(occupation)Machine-op-inspct         0.982924    
## factor(occupation)Other-service             0.000116 ***
## factor(occupation)Priv-house-serv           0.149716    
## factor(occupation)Prof-specialty            1.30e-15 ***
## factor(occupation)Protective-serv           1.37e-08 ***
## factor(occupation)Sales                     2.66e-10 ***
## factor(occupation)Tech-support              1.28e-14 ***
## factor(occupation)Transport-moving          0.002778 ** 
## factor(relationship)Not-in-family           0.340974    
## factor(relationship)Other-relative          0.013409 *  
## factor(relationship)Own-child               0.014336 *  
## factor(relationship)Unmarried               0.846790    
## factor(relationship)Wife                     < 2e-16 ***
## factor(race)Asian-Pac-Islander              0.135616    
## factor(race)Black                           0.293078    
## factor(race)Other                           0.391171    
## factor(race)White                           0.016673 *  
## factor(sex)Male                              < 2e-16 ***
## factor(capital_gain)1                        < 2e-16 ***
## factor(capital_loss)1                        < 2e-16 ***
## factor(hours_week_bins)1                     < 2e-16 ***
## factor(hours_week_bins)2                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37544  on 34188  degrees of freedom
## Residual deviance: 22413  on 34137  degrees of freedom
## AIC: 22517
## 
## Number of Fisher Scoring iterations: 7
#car::vif(back.model2)
#tells me there is multicollinearity
#alias(back.model2)
#tells me multicollinearity between relationship and marital_status
#AIC: 22517

# Alternative specification: keep workclass and drop occupation instead.
full.model3 <- glm(
  over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(relationship) + factor(race) +
    factor(sex) + factor(capital_gain) + factor(capital_loss) +
    factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = train
)

# Backward elimination (AIC-based) starting from the specification above.
back.model3 <- step(full.model3, direction = "backward")
## Start:  AIC=22951.42
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
##     factor(marital_status) + factor(relationship) + factor(race) + 
##     factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## <none>                          22861 22951
## - factor(race)             4    22897 22979
## - factor(sex)              1    22936 23024
## - factor(marital_status)   6    22958 23036
## - factor(workclass)        7    23071 23147
## - factor(relationship)     5    23099 23179
## - factor(capital_loss)     1    23119 23207
## - factor(hours_week_bins)  2    23274 23360
## - factor(age_bin)          3    23377 23461
## - factor(capital_gain)     1    23915 24003
## - factor(education_level) 14    25528 25590
# Coefficient table, deviances and AIC for the model returned by step()
# (workclass kept, occupation dropped).
summary(back.model3)
## 
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(workclass) + 
##     factor(education_level) + factor(marital_status) + factor(relationship) + 
##     factor(race) + factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins), family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7858  -0.5320  -0.1985  -0.0291   3.8386  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                 -6.55990    0.37109 -17.677
## factor(age_bin)(24,44]                       1.72107    0.13590  12.664
## factor(age_bin)(44,64]                       2.25267    0.13799  16.325
## factor(age_bin)(64,140]                      1.67516    0.16088  10.413
## factor(workclass)Federal-gov                 1.14658    0.13178   8.700
## factor(workclass)Local-gov                   0.43173    0.11712   3.686
## factor(workclass)Private                     0.59598    0.10136   5.880
## factor(workclass)Self-emp-inc                1.10300    0.12378   8.911
## factor(workclass)Self-emp-not-inc            0.12487    0.11141   1.121
## factor(workclass)State-gov                   0.42683    0.12769   3.343
## factor(workclass)Without-pay                 0.27882    0.89504   0.312
## factor(education_level)1st-4th              -1.91824    0.39874  -4.811
## factor(education_level)5th-6th              -1.34128    0.24677  -5.435
## factor(education_level)7th-8th              -1.52884    0.17623  -8.675
## factor(education_level)9th                  -1.27860    0.20574  -6.215
## factor(education_level)10th                 -0.96367    0.15005  -6.422
## factor(education_level)11th                 -0.86837    0.14535  -5.974
## factor(education_level)12th                 -0.51566    0.20062  -2.570
## factor(education_level)Some-college          0.50811    0.04716  10.773
## factor(education_level)Assoc-voc             0.58688    0.07893   7.435
## factor(education_level)Assoc-acdm            0.80421    0.08948   8.988
## factor(education_level)Bachelors             1.45300    0.04720  30.782
## factor(education_level)Masters               1.83967    0.06871  26.774
## factor(education_level)Prof-school           2.52630    0.12193  20.719
## factor(education_level)Doctorate             2.50609    0.14047  17.841
## factor(marital_status)Married-AF-spouse      1.78076    0.57813   3.080
## factor(marital_status)Married-civ-spouse     1.91262    0.25477   7.507
## factor(marital_status)Married-spouse-absent  0.06119    0.20150   0.304
## factor(marital_status)Never-married         -0.34333    0.08006  -4.289
## factor(marital_status)Separated             -0.02776    0.14912  -0.186
## factor(marital_status)Widowed                0.41653    0.13781   3.022
## factor(relationship)Not-in-family            0.19817    0.25287   0.784
## factor(relationship)Other-relative          -0.72416    0.23594  -3.069
## factor(relationship)Own-child               -0.67090    0.25061  -2.677
## factor(relationship)Unmarried               -0.09594    0.26732  -0.359
## factor(relationship)Wife                     1.09846    0.09438  11.638
## factor(race)Asian-Pac-Islander               0.26079    0.22017   1.184
## factor(race)Black                            0.12229    0.20978   0.583
## factor(race)Other                            0.14093    0.29883   0.472
## factor(race)White                            0.47057    0.19951   2.359
## factor(sex)Male                              0.60703    0.07131   8.513
## factor(capital_gain)1                        1.69560    0.05410  31.343
## factor(capital_loss)1                        1.08203    0.06769  15.986
## factor(hours_week_bins)1                    -0.60240    0.05621 -10.717
## factor(hours_week_bins)2                     0.48532    0.03677  13.199
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## factor(age_bin)(24,44]                       < 2e-16 ***
## factor(age_bin)(44,64]                       < 2e-16 ***
## factor(age_bin)(64,140]                      < 2e-16 ***
## factor(workclass)Federal-gov                 < 2e-16 ***
## factor(workclass)Local-gov                  0.000227 ***
## factor(workclass)Private                    4.10e-09 ***
## factor(workclass)Self-emp-inc                < 2e-16 ***
## factor(workclass)Self-emp-not-inc           0.262366    
## factor(workclass)State-gov                  0.000830 ***
## factor(workclass)Without-pay                0.755408    
## factor(education_level)1st-4th              1.50e-06 ***
## factor(education_level)5th-6th              5.47e-08 ***
## factor(education_level)7th-8th               < 2e-16 ***
## factor(education_level)9th                  5.14e-10 ***
## factor(education_level)10th                 1.34e-10 ***
## factor(education_level)11th                 2.31e-09 ***
## factor(education_level)12th                 0.010159 *  
## factor(education_level)Some-college          < 2e-16 ***
## factor(education_level)Assoc-voc            1.05e-13 ***
## factor(education_level)Assoc-acdm            < 2e-16 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(marital_status)Married-AF-spouse     0.002069 ** 
## factor(marital_status)Married-civ-spouse    6.04e-14 ***
## factor(marital_status)Married-spouse-absent 0.761376    
## factor(marital_status)Never-married         1.80e-05 ***
## factor(marital_status)Separated             0.852339    
## factor(marital_status)Widowed               0.002507 ** 
## factor(relationship)Not-in-family           0.433239    
## factor(relationship)Other-relative          0.002146 ** 
## factor(relationship)Own-child               0.007427 ** 
## factor(relationship)Unmarried               0.719682    
## factor(relationship)Wife                     < 2e-16 ***
## factor(race)Asian-Pac-Islander              0.236234    
## factor(race)Black                           0.559923    
## factor(race)Other                           0.637213    
## factor(race)White                           0.018343 *  
## factor(sex)Male                              < 2e-16 ***
## factor(capital_gain)1                        < 2e-16 ***
## factor(capital_loss)1                        < 2e-16 ***
## factor(hours_week_bins)1                     < 2e-16 ***
## factor(hours_week_bins)2                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37544  on 34188  degrees of freedom
## Residual deviance: 22861  on 34144  degrees of freedom
## AIC: 22951
## 
## Number of Fisher Scoring iterations: 7
#car::vif(back.model3)
#tells me there is multicollinearity
#alias(back.model3)
#tells me multicollinearity between relationship and marital_status
#AIC: 22951

#it would be fine to remove either workclass or occupation.
#then I saw high VIF with relationship and marital status in both models.
#I will remove relationship

# Workclass variant with both occupation and relationship left out.
full.model4 <- glm(
  over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) +
    factor(marital_status) + factor(race) + factor(sex) +
    factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = train
)

# Backward elimination (AIC-based) starting from the specification above.
back.model4 <- step(full.model4, direction = "backward")
## Start:  AIC=23179.11
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
##     factor(marital_status) + factor(race) + factor(sex) + factor(capital_gain) + 
##     factor(capital_loss) + factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## - factor(sex)              1    23100 23178
## <none>                          23099 23179
## - factor(race)             4    23140 23212
## - factor(workclass)        7    23308 23374
## - factor(capital_loss)     1    23366 23444
## - factor(hours_week_bins)  2    23498 23574
## - factor(age_bin)          3    23679 23753
## - factor(capital_gain)     1    24185 24263
## - factor(education_level) 14    25840 25892
## - factor(marital_status)   6    26674 26742
## 
## Step:  AIC=23177.78
## over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
##     factor(marital_status) + factor(race) + factor(capital_gain) + 
##     factor(capital_loss) + factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## <none>                          23100 23178
## - factor(race)             4    23141 23211
## - factor(workclass)        7    23309 23373
## - factor(capital_loss)     1    23367 23443
## - factor(hours_week_bins)  2    23526 23600
## - factor(age_bin)          3    23682 23754
## - factor(capital_gain)     1    24186 24262
## - factor(education_level) 14    25840 25890
## - factor(marital_status)   6    27484 27550
# Coefficient table, deviances and AIC for the step-selected workclass model
# (occupation and relationship excluded).
summary(back.model4)
## 
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(workclass) + 
##     factor(education_level) + factor(marital_status) + factor(race) + 
##     factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins), 
##     family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8141  -0.5402  -0.2174  -0.0403   3.7431  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                 -6.30893    0.26333 -23.958
## factor(age_bin)(24,44]                       1.85108    0.13399  13.815
## factor(age_bin)(44,64]                       2.37249    0.13597  17.449
## factor(age_bin)(64,140]                      1.77199    0.15848  11.181
## factor(workclass)Federal-gov                 1.12980    0.13091   8.630
## factor(workclass)Local-gov                   0.38714    0.11613   3.334
## factor(workclass)Private                     0.57198    0.10051   5.691
## factor(workclass)Self-emp-inc                1.08642    0.12330   8.811
## factor(workclass)Self-emp-not-inc            0.10843    0.11075   0.979
## factor(workclass)State-gov                   0.39558    0.12694   3.116
## factor(workclass)Without-pay                 0.01135    0.89762   0.013
## factor(education_level)1st-4th              -1.92713    0.39855  -4.835
## factor(education_level)5th-6th              -1.35909    0.24662  -5.511
## factor(education_level)7th-8th              -1.54640    0.17624  -8.775
## factor(education_level)9th                  -1.27951    0.20589  -6.215
## factor(education_level)10th                 -0.97669    0.15024  -6.501
## factor(education_level)11th                 -0.89032    0.14507  -6.137
## factor(education_level)12th                 -0.49864    0.20031  -2.489
## factor(education_level)Some-college          0.50511    0.04702  10.742
## factor(education_level)Assoc-voc             0.58309    0.07868   7.411
## factor(education_level)Assoc-acdm            0.81473    0.08903   9.152
## factor(education_level)Bachelors             1.46715    0.04706  31.175
## factor(education_level)Masters               1.85584    0.06834  27.154
## factor(education_level)Prof-school           2.55915    0.12197  20.982
## factor(education_level)Doctorate             2.52417    0.14007  18.021
## factor(marital_status)Married-AF-spouse      2.29529    0.51246   4.479
## factor(marital_status)Married-civ-spouse     2.20917    0.05884  37.547
## factor(marital_status)Married-spouse-absent  0.11898    0.19877   0.599
## factor(marital_status)Never-married         -0.30710    0.07569  -4.057
## factor(marital_status)Separated             -0.04697    0.14652  -0.321
## factor(marital_status)Widowed                0.26081    0.13455   1.938
## factor(race)Asian-Pac-Islander               0.20433    0.21812   0.937
## factor(race)Black                            0.09303    0.20797   0.447
## factor(race)Other                            0.10420    0.29744   0.350
## factor(race)White                            0.45963    0.19788   2.323
## factor(capital_gain)1                        1.71755    0.05384  31.900
## factor(capital_loss)1                        1.10210    0.06764  16.293
## factor(hours_week_bins)1                    -0.55473    0.05416 -10.243
## factor(hours_week_bins)2                     0.49529    0.03654  13.555
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## factor(age_bin)(24,44]                       < 2e-16 ***
## factor(age_bin)(44,64]                       < 2e-16 ***
## factor(age_bin)(64,140]                      < 2e-16 ***
## factor(workclass)Federal-gov                 < 2e-16 ***
## factor(workclass)Local-gov                  0.000857 ***
## factor(workclass)Private                    1.26e-08 ***
## factor(workclass)Self-emp-inc                < 2e-16 ***
## factor(workclass)Self-emp-not-inc           0.327586    
## factor(workclass)State-gov                  0.001831 ** 
## factor(workclass)Without-pay                0.989910    
## factor(education_level)1st-4th              1.33e-06 ***
## factor(education_level)5th-6th              3.57e-08 ***
## factor(education_level)7th-8th               < 2e-16 ***
## factor(education_level)9th                  5.15e-10 ***
## factor(education_level)10th                 7.98e-11 ***
## factor(education_level)11th                 8.41e-10 ***
## factor(education_level)12th                 0.012798 *  
## factor(education_level)Some-college          < 2e-16 ***
## factor(education_level)Assoc-voc            1.26e-13 ***
## factor(education_level)Assoc-acdm            < 2e-16 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(marital_status)Married-AF-spouse     7.50e-06 ***
## factor(marital_status)Married-civ-spouse     < 2e-16 ***
## factor(marital_status)Married-spouse-absent 0.549463    
## factor(marital_status)Never-married         4.96e-05 ***
## factor(marital_status)Separated             0.748540    
## factor(marital_status)Widowed               0.052583 .  
## factor(race)Asian-Pac-Islander              0.348872    
## factor(race)Black                           0.654645    
## factor(race)Other                           0.726089    
## factor(race)White                           0.020193 *  
## factor(capital_gain)1                        < 2e-16 ***
## factor(capital_loss)1                        < 2e-16 ***
## factor(hours_week_bins)1                     < 2e-16 ***
## factor(hours_week_bins)2                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37544  on 34188  degrees of freedom
## Residual deviance: 23100  on 34150  degrees of freedom
## AIC: 23178
## 
## Number of Fisher Scoring iterations: 7
# Collinearity check: generalized VIFs for the step-selected workclass model.
car::vif(back.model4)
##                             GVIF Df GVIF^(1/(2*Df))
## factor(age_bin)         1.263886  3        1.039804
## factor(workclass)       1.257626  7        1.016508
## factor(education_level) 1.211158 14        1.006866
## factor(marital_status)  1.204352  6        1.015616
## factor(race)            1.053393  4        1.006523
## factor(capital_gain)    1.033554  1        1.016639
## factor(capital_loss)    1.012956  1        1.006457
## factor(hours_week_bins) 1.157862  2        1.037323
#no more issues
#AIC: 23178

# Occupation variant with both workclass and relationship left out.
full.model5 <- glm(
  over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) +
    factor(marital_status) + factor(race) + factor(sex) +
    factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = train
)

# Backward elimination (AIC-based) starting from the specification above.
back.model5 <- step(full.model5, direction = "backward")
## Start:  AIC=22735.28
## over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) + 
##     factor(marital_status) + factor(race) + factor(sex) + factor(capital_gain) + 
##     factor(capital_loss) + factor(hours_week_bins)
## 
##                           Df Deviance   AIC
## <none>                          22641 22735
## - factor(sex)              1    22645 22737
## - factor(race)             4    22666 22752
## - factor(capital_loss)     1    22888 22980
## - factor(hours_week_bins)  2    22972 23062
## - factor(age_bin)          3    23152 23240
## - factor(occupation)      14    23308 23374
## - factor(capital_gain)     1    23702 23794
## - factor(education_level) 14    23832 23898
## - factor(marital_status)   6    26146 26228
# Coefficient table, deviances and AIC for the step-selected occupation model
# (workclass and relationship excluded).
summary(back.model5)
## 
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(occupation) + 
##     factor(education_level) + factor(marital_status) + factor(race) + 
##     factor(sex) + factor(capital_gain) + factor(capital_loss) + 
##     factor(hours_week_bins), family = binomial(link = "logit"), 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7726  -0.5297  -0.2019  -0.0344   3.9054  
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                 -6.11502    0.26556 -23.027
## factor(age_bin)(24,44]                       1.76584    0.13489  13.091
## factor(age_bin)(44,64]                       2.25618    0.13681  16.491
## factor(age_bin)(64,140]                      1.63468    0.15948  10.250
## factor(occupation)Adm-clerical               0.47107    0.11146   4.226
## factor(occupation)Armed-Forces               0.93719    1.22759   0.763
## factor(occupation)Craft-repair               0.35762    0.10617   3.368
## factor(occupation)Exec-managerial            1.14540    0.10416  10.996
## factor(occupation)Farming-fishing           -0.60360    0.14618  -4.129
## factor(occupation)Handlers-cleaners         -0.25570    0.15055  -1.699
## factor(occupation)Machine-op-inspct         -0.03162    0.12309  -0.257
## factor(occupation)Other-service             -0.52044    0.13501  -3.855
## factor(occupation)Priv-house-serv           -1.68443    1.02283  -1.647
## factor(occupation)Prof-specialty             0.84133    0.10650   7.900
## factor(occupation)Protective-serv            0.76197    0.14158   5.382
## factor(occupation)Sales                      0.65073    0.10713   6.074
## factor(occupation)Tech-support               0.96869    0.12898   7.510
## factor(occupation)Transport-moving           0.32133    0.12049   2.667
## factor(education_level)1st-4th              -1.59813    0.40773  -3.920
## factor(education_level)5th-6th              -1.12758    0.25262  -4.464
## factor(education_level)7th-8th              -1.44945    0.17750  -8.166
## factor(education_level)9th                  -1.15399    0.20896  -5.522
## factor(education_level)10th                 -0.88530    0.15126  -5.853
## factor(education_level)11th                 -0.78432    0.14679  -5.343
## factor(education_level)12th                 -0.43070    0.20357  -2.116
## factor(education_level)Some-college          0.35560    0.04837   7.352
## factor(education_level)Assoc-voc             0.44212    0.08031   5.505
## factor(education_level)Assoc-acdm            0.58761    0.09100   6.457
## factor(education_level)Bachelors             1.10212    0.05159  21.363
## factor(education_level)Masters               1.37177    0.07370  18.613
## factor(education_level)Prof-school           2.10833    0.12888  16.358
## factor(education_level)Doctorate             2.02924    0.14484  14.010
## factor(marital_status)Married-AF-spouse      2.43657    0.52844   4.611
## factor(marital_status)Married-civ-spouse     2.21403    0.06278  35.264
## factor(marital_status)Married-spouse-absent  0.18291    0.20140   0.908
## factor(marital_status)Never-married         -0.28392    0.07637  -3.718
## factor(marital_status)Separated             -0.01666    0.14724  -0.113
## factor(marital_status)Widowed                0.32228    0.13635   2.364
## factor(race)Asian-Pac-Islander               0.29116    0.22137   1.315
## factor(race)Black                            0.20205    0.21141   0.956
## factor(race)Other                            0.23084    0.30233   0.764
## factor(race)White                            0.47972    0.20088   2.388
## factor(sex)Male                              0.09206    0.04943   1.862
## factor(capital_gain)1                        1.71141    0.05429  31.523
## factor(capital_loss)1                        1.07132    0.06837  15.670
## factor(hours_week_bins)1                    -0.56970    0.05550 -10.265
## factor(hours_week_bins)2                     0.42024    0.03725  11.281
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## factor(age_bin)(24,44]                       < 2e-16 ***
## factor(age_bin)(44,64]                       < 2e-16 ***
## factor(age_bin)(64,140]                      < 2e-16 ***
## factor(occupation)Adm-clerical              2.37e-05 ***
## factor(occupation)Armed-Forces              0.445202    
## factor(occupation)Craft-repair              0.000756 ***
## factor(occupation)Exec-managerial            < 2e-16 ***
## factor(occupation)Farming-fishing           3.64e-05 ***
## factor(occupation)Handlers-cleaners         0.089412 .  
## factor(occupation)Machine-op-inspct         0.797262    
## factor(occupation)Other-service             0.000116 ***
## factor(occupation)Priv-house-serv           0.099592 .  
## factor(occupation)Prof-specialty            2.79e-15 ***
## factor(occupation)Protective-serv           7.38e-08 ***
## factor(occupation)Sales                     1.24e-09 ***
## factor(occupation)Tech-support              5.90e-14 ***
## factor(occupation)Transport-moving          0.007656 ** 
## factor(education_level)1st-4th              8.87e-05 ***
## factor(education_level)5th-6th              8.06e-06 ***
## factor(education_level)7th-8th              3.19e-16 ***
## factor(education_level)9th                  3.34e-08 ***
## factor(education_level)10th                 4.83e-09 ***
## factor(education_level)11th                 9.13e-08 ***
## factor(education_level)12th                 0.034373 *  
## factor(education_level)Some-college         1.95e-13 ***
## factor(education_level)Assoc-voc            3.69e-08 ***
## factor(education_level)Assoc-acdm           1.06e-10 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(marital_status)Married-AF-spouse     4.01e-06 ***
## factor(marital_status)Married-civ-spouse     < 2e-16 ***
## factor(marital_status)Married-spouse-absent 0.363769    
## factor(marital_status)Never-married         0.000201 ***
## factor(marital_status)Separated             0.909933    
## factor(marital_status)Widowed               0.018095 *  
## factor(race)Asian-Pac-Islander              0.188423    
## factor(race)Black                           0.339232    
## factor(race)Other                           0.445133    
## factor(race)White                           0.016934 *  
## factor(sex)Male                             0.062568 .  
## factor(capital_gain)1                        < 2e-16 ***
## factor(capital_loss)1                        < 2e-16 ***
## factor(hours_week_bins)1                     < 2e-16 ***
## factor(hours_week_bins)2                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 37544  on 34188  degrees of freedom
## Residual deviance: 22641  on 34142  degrees of freedom
## AIC: 22735
## 
## Number of Fisher Scoring iterations: 7
# Collinearity check: generalized VIFs for the step-selected occupation model.
car::vif(back.model5)
##                             GVIF Df GVIF^(1/(2*Df))
## factor(age_bin)         1.269761  3        1.040608
## factor(occupation)      2.140002 14        1.027544
## factor(education_level) 1.743602 14        1.020054
## factor(marital_status)  1.490994  6        1.033847
## factor(race)            1.054687  4        1.006678
## factor(sex)             1.444416  1        1.201839
## factor(capital_gain)    1.035292  1        1.017493
## factor(capital_loss)    1.013611  1        1.006782
## factor(hours_week_bins) 1.230957  2        1.053321
#no more issues
#AIC: 22735


#I have two models workclass (full.model4) and occupation (full.model5)
#higher AIC than original but I feel better about multicollinearity. 
# Start:  AIC=23178
# over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
#     factor(marital_status) + factor(race) + factor(capital_gain) + 
#     factor(capital_loss) + factor(hours_week_bins)

#higher AIC than original but I feel better about multicollinearity.
# Start:  AIC=22735
# over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) + 
#     factor(marital_status) + factor(race) + factor(capital_gain) + 
#     factor(capital_loss) + factor(hours_week_bins)



#Looking at adding interactions into the model:
#would normally bring in interactions and look using forward selection. **Here I will forego this step**
# full.model <- glm(over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + 
#     factor(marital_status) + factor(race) + factor(capital_gain) + 
#     factor(capital_loss) + factor(hours_week_bins) ,
# data =train, family =binomial(link ='logit'))
# summary(full.model)
# AIC(full.model)




# Probability-based metrics for full.model4 on the training data
# (the same metrics are repeated on the validation and test sets).

# Fitted probabilities from full.model4; predict() is called without newdata.
train$p_hat <- predict(full.model4, type = "response")

# Predicted probabilities split by observed outcome.
p1 <- with(train, p_hat[over_50k == 1])
p0 <- with(train, p_hat[over_50k == 0])

# Coefficient of discrimination: mean p_hat of the 1s minus mean p_hat of the 0s.
coef_discrim <- mean(p1) - mean(p0)
#0.404

# Overlaid densities of the predicted probability for each outcome.
ggplot(train, aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )

# Concordant / discordant / tied pair percentages.
InformationValue::Concordance(train$over_50k, train$p_hat)
## $Concordance
## [1] 0.8922262
## 
## $Discordance
## [1] 0.1077738
## 
## $Tied
## [1] 4.163336e-17
## 
## $Pairs
## [1] 212146278
# $Concordance
# [1] 0.8922262
# 
# $Discordance
# [1] 0.1077738
# 
# $Tied
# [1] 4.163336e-17
# 
# $Pairs
# [1] 212146278
# Somers' D for the same actuals and scores; it equals the concordance minus
# the discordance printed above (0.8922262 - 0.1077738).
InformationValue::somersD(train$over_50k, train$p_hat)
## [1] 0.7844523
#0.7844523

library(InformationValue)

# Candidate classification cutoffs: 0.02, 0.04, ..., 0.98.
cutoff <- (1:49) / 50

# Sensitivity, specificity and Youden's index of the current train$p_hat at
# every cutoff. vapply() builds each vector in a single pass (instead of
# growing it with c() inside a loop) and verifies that every call returns
# exactly one number.
sens <- vapply(
  cutoff,
  function(th) sensitivity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) specificity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) youdensIndex(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)

# Show the cutoffs ordered by Youden's index, best first.
print(ctable[order(-youden), ])
##    cutoff        sens      spec      youden
## 9    0.18 0.898109502 0.7225358 0.620645308
## 10   0.20 0.883746624 0.7366663 0.620412907
## 11   0.22 0.871716180 0.7468802 0.618596339
## 12   0.24 0.824330960 0.7922282 0.616559198
## 8    0.16 0.908175792 0.7069846 0.615160394
## 13   0.26 0.811932237 0.8009062 0.612838430
## 14   0.28 0.804443899 0.8064355 0.610879409
## 15   0.30 0.790817579 0.8153823 0.606199832
## 7    0.14 0.927940093 0.6753062 0.603246318
## 16   0.32 0.781242328 0.8213723 0.602614673
## 6    0.12 0.938006384 0.6553008 0.593307232
## 5    0.10 0.948318193 0.6287678 0.577086000
## 17   0.34 0.703044439 0.8715970 0.574641413
## 18   0.36 0.688313283 0.8801213 0.568434620
## 19   0.38 0.680333906 0.8840379 0.564371843
## 20   0.40 0.673336607 0.8873786 0.560715173
## 21   0.42 0.656150258 0.8942518 0.550402072
## 22   0.44 0.648539160 0.8979764 0.546515584
## 4    0.08 0.963663148 0.5796951 0.543358267
## 23   0.46 0.586545544 0.9203241 0.506869623
## 3    0.06 0.972992880 0.5323119 0.505304826
## 24   0.48 0.561870857 0.9289252 0.490796096
## 25   0.50 0.554750798 0.9312291 0.485979919
## 26   0.52 0.542843113 0.9345314 0.477374465
## 27   0.54 0.529462313 0.9379104 0.467372692
## 28   0.56 0.516695311 0.9406750 0.457370348
## 2    0.04 0.982936411 0.4519065 0.434842873
## 29   0.58 0.466118340 0.9523864 0.418504778
## 30   0.60 0.448931991 0.9566102 0.405542213
## 31   0.62 0.435428431 0.9593749 0.394803311
## 32   0.64 0.419224159 0.9635603 0.382784425
## 33   0.66 0.403019887 0.9664785 0.369498403
## 34   0.68 0.361404370 0.9726606 0.334064970
## 35   0.70 0.335624847 0.9775372 0.313161997
## 36   0.72 0.322121287 0.9793035 0.301424746
## 1    0.02 0.995335134 0.3052644 0.300599504
## 37   0.74 0.293641051 0.9833353 0.276976304
## 38   0.76 0.270684999 0.9864071 0.257092095
## 39   0.78 0.252393813 0.9885958 0.240989597
## 40   0.80 0.217652836 0.9912836 0.208936482
## 41   0.82 0.197520255 0.9932035 0.190723803
## 42   0.84 0.170635895 0.9949699 0.165605752
## 43   0.86 0.153326786 0.9961602 0.149486983
## 44   0.88 0.123005156 0.9970434 0.120048507
## 45   0.90 0.108151240 0.9977345 0.105885756
## 46   0.92 0.083108274 0.9987329 0.081841139
## 47   0.94 0.057451510 0.9990784 0.056529957
## 48   0.96 0.028971274 0.9996160 0.028587294
## 49   0.98 0.008593175 0.9999616 0.008554777
# Optimal cutoff of 0.18: the row with the largest Youden's index in the table above.
# In the matrix printed below, columns are the actual classes and rows are the
# predicted classes (7316 of the 830 + 7316 actual 1s are predicted 1, matching
# the sensitivity of 0.898 listed for cutoff 0.18).
confusionMatrix(train$over_50k, train$p_hat, threshold =0.18)
##       0    1
## 0 18817  830
## 1  7226 7316
plotROC(train$over_50k, train$p_hat)

# Concordance 0.8922 (value printed by InformationValue::Concordance above).


# Probability-based metrics for full.model5 on the training data
# (the same metrics are repeated on the validation and test sets).

# Fitted probabilities from full.model5; this overwrites the p_hat column that
# held the full.model4 probabilities.
train$p_hat <- predict(full.model5, type = "response")

# Predicted probabilities split by observed outcome.
p1 <- with(train, p_hat[over_50k == 1])
p0 <- with(train, p_hat[over_50k == 0])

# Coefficient of discrimination: mean p_hat of the 1s minus mean p_hat of the 0s.
coef_discrim <- mean(p1) - mean(p0)
#0.404
# NOTE(review): 0.404 is the same value recorded for full.model4; the result
# is not printed here, so confirm it was re-computed for full.model5.

# Overlaid densities of the predicted probability for each outcome.
ggplot(train, aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )

# Concordant / discordant / tied pair percentages.
InformationValue::Concordance(train$over_50k, train$p_hat)
## $Concordance
## [1] 0.8982773
## 
## $Discordance
## [1] 0.1017227
## 
## $Tied
## [1] -4.163336e-17
## 
## $Pairs
## [1] 212146278
# $Concordance
# [1] 0.8982773
# 
# $Discordance
# [1] 0.1017227
# 
# $Tied
# [1] -4.163336e-17
# 
# $Pairs
# [1] 212146278
# Somers' D for the same actuals and scores; it equals the concordance minus
# the discordance printed above (0.8982773 - 0.1017227).
InformationValue::somersD(train$over_50k, train$p_hat)
## [1] 0.7965545
#0.7965545

library(InformationValue)

# Candidate classification cutoffs: 0.02, 0.04, ..., 0.98.
cutoff <- (1:49) / 50

# Sensitivity, specificity and Youden's index of the current train$p_hat at
# every cutoff. vapply() builds each vector in a single pass (instead of
# growing it with c() inside a loop) and verifies that every call returns
# exactly one number.
sens <- vapply(
  cutoff,
  function(th) sensitivity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) specificity(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) youdensIndex(train$over_50k, train$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)

# Show the cutoffs ordered by Youden's index, best first.
print(ctable[order(-youden), ])
##    cutoff        sens      spec      youden
## 10   0.20 0.880554874 0.7544062 0.634961048
## 11   0.22 0.867787871 0.7667703 0.634558213
## 9    0.18 0.895654309 0.7365511 0.632205397
## 12   0.24 0.833169654 0.7957609 0.628930511
## 8    0.16 0.911735821 0.7138195 0.625555274
## 13   0.26 0.815492266 0.8078946 0.623386902
## 14   0.28 0.803952860 0.8162654 0.620218268
## 15   0.30 0.787257550 0.8290520 0.616309502
## 7    0.14 0.926221458 0.6859809 0.612202336
## 16   0.32 0.761478026 0.8470990 0.608577055
## 17   0.34 0.734348146 0.8638022 0.598150320
## 6    0.12 0.940216057 0.6547633 0.594979333
## 18   0.36 0.719985269 0.8713282 0.591313457
## 19   0.38 0.703780997 0.8797374 0.583518354
## 5    0.10 0.950896145 0.6258112 0.576707304
## 20   0.40 0.681070464 0.8904888 0.571559271
## 21   0.42 0.659587528 0.8997043 0.559291863
## 4    0.08 0.962681070 0.5885267 0.551207738
## 22   0.44 0.638472870 0.9076143 0.546087200
## 23   0.46 0.624846550 0.9122605 0.537107043
## 24   0.48 0.603609133 0.9191721 0.522781272
## 3    0.06 0.975325313 0.5352686 0.510593907
## 25   0.50 0.583599313 0.9268901 0.510489456
## 26   0.52 0.556714952 0.9352609 0.491975867
## 27   0.54 0.544316229 0.9387167 0.483032967
## 28   0.56 0.530076111 0.9424797 0.472555856
## 29   0.58 0.510802848 0.9472411 0.458043949
## 2    0.04 0.985637123 0.4616212 0.447258288
## 30   0.60 0.479499141 0.9543831 0.433882276
## 31   0.62 0.460962436 0.9581461 0.419108578
## 32   0.64 0.438374662 0.9624467 0.400821385
## 33   0.66 0.411122023 0.9670161 0.378138112
## 34   0.68 0.388779769 0.9709327 0.359712457
## 35   0.70 0.364718880 0.9748493 0.339568168
## 1    0.02 0.995457893 0.3417041 0.337161998
## 36   0.72 0.337097962 0.9784587 0.315556665
## 37   0.74 0.316965382 0.9804938 0.297459180
## 38   0.76 0.274245028 0.9857159 0.259960960
## 39   0.78 0.256199362 0.9875974 0.243796797
## 40   0.80 0.227841886 0.9903237 0.218165581
## 41   0.82 0.209427940 0.9917444 0.201172363
## 42   0.84 0.183157378 0.9939715 0.177128887
## 43   0.86 0.150012276 0.9961218 0.146134075
## 44   0.88 0.130002455 0.9969282 0.126930612
## 45   0.90 0.108274000 0.9978113 0.106085312
## 46   0.92 0.080653081 0.9987329 0.079385946
## 47   0.94 0.060152222 0.9991168 0.059269067
## 48   0.96 0.031549227 0.9997312 0.031280440
## 49   0.98 0.007242819 1.0000000 0.007242819
# Optimal cutoff of 0.20: the row with the largest Youden's index in the table above.
# In the matrix printed below, columns are the actual classes and rows are the
# predicted classes (7173 of the 973 + 7173 actual 1s are predicted 1, matching
# the sensitivity of 0.881 listed for cutoff 0.20).
confusionMatrix(train$over_50k, train$p_hat, threshold =0.20)
##       0    1
## 0 19647  973
## 1  6396 7173
plotROC(train$over_50k, train$p_hat)

# Concordance 0.8983 (value printed by InformationValue::Concordance above).

Final Models Selected:

full.model4 AIC=23178 over_50k ~ factor(age_bin) + factor(workclass) + factor(education_level) + factor(marital_status) + factor(race) + factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins)

full.model5 AIC=22735 over_50k ~ factor(age_bin) + factor(occupation) + factor(education_level) + factor(marital_status) + factor(race) + factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins)

Building the model

#STEP 3: Validation

# Perform all binning on the validation set.
validate <- validate %>%
  mutate(
    # Capital gain / loss become 0/1 indicators (any non-zero amount -> 1).
    capital_gain = ifelse(capital_gain == 0, 0, 1),
    capital_loss = ifelse(capital_loss == 0, 0, 1),
    # Hours per week, in three steps that keep the intermediate columns:
    # under 40 -> 1 (this overwrites hours_week itself), over 40 -> 2,
    # exactly 40 -> 0.
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin),
    # Age groups (0,24], (24,44], (44,64], (64,140].
    age_bin = cut(age, breaks = c(0, 24, 44, 64, 140))
  )
head(validate, 10)
##    id age workclass education_level education_num     marital_status
## 4   4  53   Private            11th             7 Married-civ-spouse
## 10 10  42   Private       Bachelors            13 Married-civ-spouse
## 11 11  37   Private    Some-college            10 Married-civ-spouse
## 33 33  45   Private       Bachelors            13           Divorced
## 43 43  24   Private       Bachelors            13 Married-civ-spouse
## 44 44  49   Private         HS-grad             9          Separated
## 49 49  41 State-gov       Assoc-voc            11 Married-civ-spouse
## 52 52  18   Private         HS-grad             9      Never-married
## 53 53  47   Private     Prof-school            15 Married-civ-spouse
## 56 56  43   Private    Some-college            10 Married-civ-spouse
##           occupation relationship  race    sex capital_gain capital_loss
## 4  Handlers-cleaners      Husband Black   Male            0            0
## 10   Exec-managerial      Husband White   Male            1            0
## 11   Exec-managerial      Husband Black   Male            0            0
## 33   Exec-managerial    Own-child White   Male            0            1
## 43      Tech-support      Husband White   Male            0            0
## 44      Adm-clerical    Unmarried White Female            0            0
## 49      Craft-repair      Husband White   Male            0            0
## 52     Other-service    Own-child White Female            0            0
## 53    Prof-specialty         Wife White Female            0            1
## 56      Tech-support      Husband White   Male            0            0
##    hours_week       country over_50k hours_week_bin hours_week_bins age_bin
## 4          40 United-States        0             40               0 (44,64]
## 10         40 United-States        1             40               0 (24,44]
## 11         80 United-States        1              2               2 (24,44]
## 33         40 United-States        0             40               0 (44,64]
## 43         50 United-States        0              2               2  (0,24]
## 44         40 United-States        0             40               0 (44,64]
## 49         40 United-States        0             40               0 (24,44]
## 52          1             ?        0              1               1  (0,24]
## 53         60      Honduras        1              2               2 (44,64]
## 56         40 United-States        1             40               0 (24,44]
# Fold sparse categories into broader ones.
validate <- validate %>%
  mutate(
    # NOTE(review): 'Loas' may be a misspelling of 'Laos'; confirm against the
    # country values actually present in the data.
    country = ifelse(
      country %in% c('Holand-Netherlands', 'Honduras', 'Loas'),
      '?',
      country
    ),
    education_level = ifelse(education_level == 'Preschool', '1st-4th', education_level),
    workclass = ifelse(workclass == 'Never-worked', '?', workclass)
  )

# Re-level education_level with "HS-grad" as the first level (presumably so it
# acts as the reference category in the models -- confirm against the fit).
validate$education_level <- factor(
  validate$education_level,
  levels = c(
    "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th",
    "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors",
    "Masters", "Prof-school", "Doctorate"
  )
)
count(validate, education_level, sort = TRUE)
##    education_level    n
## 1          HS-grad 3193
## 2     Some-college 2151
## 3        Bachelors 1606
## 4          Masters  540
## 5        Assoc-voc  440
## 6             11th  339
## 7       Assoc-acdm  312
## 8             10th  269
## 9          7th-8th  177
## 10     Prof-school  163
## 11             9th  149
## 12       Doctorate  131
## 13            12th  122
## 14         5th-6th  103
## 15         1st-4th   73
# Outcome plus the categorical / binned predictors of the validation set.
cat_var2 <- dplyr::select(
  validate,
  over_50k, workclass, education_level, marital_status, occupation,
  relationship, race, sex, country, age_bin, capital_gain, capital_loss,
  hours_week_bins
)

# Score the validation set with full.model4; new_data is a copy of validate
# with the predicted probability appended as column Pred.
new_data <- data.frame(
  validate,
  'Pred' = predict(full.model4, newdata = validate, type = "response")
)


# Probability-based metrics for full.model4 on the validation data.
validate$p_hat <- predict(full.model4, newdata = validate, type = "response")

# Predicted probabilities split by observed outcome.
p1 <- with(validate, p_hat[over_50k == 1])
p0 <- with(validate, p_hat[over_50k == 0])

# Coefficient of discrimination: mean p_hat of the 1s minus mean p_hat of the 0s.
coef_discrim <- mean(p1) - mean(p0)
coef_discrim
## [1] 0.4027499


# Overlaid densities of the predicted probability for each outcome.
ggplot(validate, aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )

# Concordant / discordant / tied pair percentages.
InformationValue::Concordance(validate$over_50k, validate$p_hat)
## $Concordance
## [1] 0.8913373
## 
## $Discordance
## [1] 0.1086627
## 
## $Tied
## [1] 1.387779e-17
## 
## $Pairs
## [1] 17452556
# $Concordance
# [1] 0.8913373
# 
# $Discordance
# [1] 0.1086627
# 
# $Tied
# [1] 1.387779e-17
# 
# $Pairs
# [1] 17452556
# Somers' D for the same actuals and scores; it equals the concordance minus
# the discordance printed above (0.8913373 - 0.1086627).
InformationValue::somersD(validate$over_50k, validate$p_hat)
## [1] 0.7826747
#0.7826747

library(InformationValue)

# Candidate classification cutoffs: 0.02, 0.04, ..., 0.98.
cutoff <- (1:49) / 50

# Sensitivity, specificity and Youden's index of the current validate$p_hat at
# every cutoff. vapply() builds each vector in a single pass (instead of
# growing it with c() inside a loop) and verifies that every call returns
# exactly one number.
sens <- vapply(
  cutoff,
  function(th) sensitivity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) specificity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) youdensIndex(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)

# Show the cutoffs ordered by Youden's index, best first.
print(ctable[order(-youden), ])
##    cutoff        sens      spec      youden
## 12   0.24 0.833050127 0.7881036 0.621153715
## 13   0.26 0.820730671 0.7966010 0.617331696
## 11   0.22 0.875106202 0.7421095 0.617215725
## 9    0.18 0.897196262 0.7199892 0.617185471
## 14   0.28 0.814358539 0.8019962 0.616354762
## 10   0.20 0.883602379 0.7325330 0.616135425
## 8    0.16 0.908241291 0.7048827 0.613123946
## 15   0.30 0.798215803 0.8126517 0.610867543
## 16   0.32 0.786745964 0.8183167 0.605062662
## 7    0.14 0.927357689 0.6754788 0.602836513
## 6    0.12 0.941801189 0.6503912 0.592192341
## 17   0.34 0.712404418 0.8656596 0.578063981
## 18   0.36 0.698385726 0.8744268 0.572812487
## 5    0.10 0.950297366 0.6193688 0.569666128
## 19   0.38 0.687340697 0.8776639 0.565004576
## 20   0.40 0.677994902 0.8796871 0.557681981
## 21   0.42 0.659728122 0.8865660 0.546294079
## 22   0.44 0.652931181 0.8910170 0.543948176
## 4    0.08 0.964316058 0.5691934 0.533509476
## 23   0.46 0.595581988 0.9171837 0.512765695
## 24   0.48 0.573067120 0.9270299 0.500097063
## 3    0.06 0.975785896 0.5200971 0.495883010
## 25   0.50 0.562871708 0.9305368 0.493408530
## 26   0.52 0.550127443 0.9339088 0.484036264
## 27   0.54 0.537383178 0.9363367 0.473719838
## 28   0.56 0.523364486 0.9393040 0.462668505
## 2    0.04 0.986406117 0.4398435 0.426249656
## 29   0.58 0.467714528 0.9500944 0.417808944
## 30   0.60 0.442650807 0.9548152 0.397466022
## 31   0.62 0.429906542 0.9575128 0.387419356
## 32   0.64 0.412064571 0.9612895 0.373354023
## 33   0.66 0.399320306 0.9649312 0.364251517
## 34   0.68 0.356414613 0.9707310 0.327145663
## 35   0.70 0.332625319 0.9761262 0.308751566
## 36   0.72 0.319456245 0.9784192 0.297875452
## 1    0.02 0.995751912 0.2862153 0.281967180
## 37   0.74 0.291843670 0.9820610 0.273904636
## 38   0.76 0.272302464 0.9840842 0.256386629
## 39   0.78 0.254885302 0.9858376 0.240722906
## 40   0.80 0.215802889 0.9892096 0.205012492
## 41   0.82 0.200509771 0.9906933 0.191203054
## 42   0.84 0.172047579 0.9924467 0.164494301
## 43   0.86 0.154205607 0.9946048 0.148810409
## 44   0.88 0.124044180 0.9967629 0.120807061
## 45   0.90 0.107901444 0.9973024 0.105203845
## 46   0.92 0.080713679 0.9979768 0.078690479
## 47   0.94 0.053950722 0.9985163 0.052467043
## 48   0.96 0.024638912 0.9995954 0.024234273
## 49   0.98 0.006796941 1.0000000 0.006796941
# Optimal cutoff of 0.24: the row with the largest Youden's index in the
# validation table above (the training table for full.model4 favoured 0.18).
# In the matrix printed below, columns are the actual classes and rows are the
# predicted classes (1961 of the 393 + 1961 actual 1s are predicted 1, matching
# the sensitivity of 0.833 listed for cutoff 0.24).
confusionMatrix(validate$over_50k, validate$p_hat, threshold =0.24)
##      0    1
## 0 5843  393
## 1 1571 1961
plotROC(validate$over_50k, validate$p_hat)

# Concordance 0.8913 (value printed by InformationValue::Concordance above).



# Score the validation set with full.model5; new_data is a copy of validate
# (including its current p_hat column) with the predicted probability appended
# as column Pred.
new_data <- data.frame(
  validate,
  'Pred' = predict(full.model5, newdata = validate, type = "response")
)



# Probability-based metrics for full.model5 on the validation data.
# This overwrites the p_hat column that held the full.model4 probabilities.
validate$p_hat <- predict(full.model5, newdata = validate, type = "response")

# Predicted probabilities split by observed outcome.
p1 <- with(validate, p_hat[over_50k == 1])
p0 <- with(validate, p_hat[over_50k == 0])

# Coefficient of discrimination: mean p_hat of the 1s minus mean p_hat of the 0s.
coef_discrim <- mean(p1) - mean(p0)
coef_discrim
## [1] 0.4167998


# Overlaid densities of the predicted probability for each outcome.
ggplot(validate, aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )

# Concordant / discordant / tied pair percentages.
InformationValue::Concordance(validate$over_50k, validate$p_hat)
## $Concordance
## [1] 0.8981847
## 
## $Discordance
## [1] 0.1018153
## 
## $Tied
## [1] 1.387779e-17
## 
## $Pairs
## [1] 17452556
# $Concordance
# [1] 0.8981847
# 
# $Discordance
# [1] 0.1018153
# 
# $Tied
# [1] 1.387779e-17
# 
# $Pairs
# [1] 17452556
# Somers' D for the same actuals and scores; it equals the concordance minus
# the discordance printed above (0.8981847 - 0.1018153).
InformationValue::somersD(validate$over_50k, validate$p_hat)
## [1] 0.7963693
#0.7963693

library(InformationValue)

# Candidate classification cutoffs: 0.02, 0.04, ..., 0.98.
cutoff <- (1:49) / 50

# Sensitivity, specificity and Youden's index of the current validate$p_hat at
# every cutoff. vapply() builds each vector in a single pass (instead of
# growing it with c() inside a loop) and verifies that every call returns
# exactly one number.
sens <- vapply(
  cutoff,
  function(th) sensitivity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) specificity(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) youdensIndex(validate$over_50k, validate$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)

# Show the cutoffs ordered by Youden's index, best first.
print(ctable[order(-youden), ])
##    cutoff        sens      spec      youden
## 11   0.22 0.875106202 0.7607230 0.635829159
## 10   0.20 0.885301614 0.7471001 0.632401695
## 12   0.24 0.841121495 0.7891826 0.630304123
## 13   0.26 0.826677995 0.8029404 0.629618378
## 9    0.18 0.898045879 0.7290262 0.627072046
## 14   0.28 0.816057774 0.8099541 0.626011915
## 15   0.30 0.795666950 0.8223631 0.618030047
## 8    0.16 0.911214953 0.7063663 0.617581287
## 16   0.32 0.774001699 0.8411114 0.615113110
## 7    0.14 0.930331351 0.6804694 0.610800733
## 17   0.34 0.745964316 0.8571621 0.603126442
## 18   0.36 0.730671198 0.8643108 0.594981961
## 6    0.12 0.943925234 0.6507958 0.594721025
## 19   0.38 0.714953271 0.8734826 0.588435872
## 5    0.10 0.955819881 0.6222012 0.578021122
## 20   0.40 0.689889550 0.8827893 0.572678867
## 21   0.42 0.669073917 0.8931751 0.562248991
## 22   0.44 0.648683093 0.9016725 0.550355604
## 4    0.08 0.965590484 0.5817373 0.547327738
## 23   0.46 0.633814783 0.9073375 0.541152253
## 24   0.48 0.612999150 0.9174535 0.530452617
## 25   0.50 0.592183517 0.9254114 0.517594901
## 3    0.06 0.979609176 0.5236040 0.503213168
## 26   0.52 0.558623619 0.9333693 0.491992921
## 27   0.54 0.545454545 0.9372808 0.482735366
## 28   0.56 0.524214104 0.9409226 0.465136683
## 29   0.58 0.509345794 0.9446992 0.454045012
## 2    0.04 0.989804588 0.4465875 0.436392125
## 30   0.60 0.477485132 0.9530618 0.430546907
## 31   0.62 0.458793543 0.9562989 0.415092437
## 32   0.64 0.440526763 0.9615592 0.402085975
## 33   0.66 0.407816483 0.9669544 0.374770893
## 34   0.68 0.389124894 0.9696520 0.358776904
## 35   0.70 0.368309261 0.9743728 0.342682069
## 1    0.02 0.996176720 0.3235770 0.319753737
## 36   0.72 0.341121495 0.9773402 0.318461663
## 37   0.74 0.322854715 0.9797680 0.302622722
## 38   0.76 0.281648258 0.9854330 0.267081223
## 39   0.78 0.264655905 0.9874562 0.252112069
## 40   0.80 0.232795242 0.9897491 0.222544365
## 41   0.82 0.214528462 0.9910979 0.205626385
## 42   0.84 0.183942226 0.9935258 0.177467988
## 43   0.86 0.153780799 0.9958187 0.149599520
## 44   0.88 0.132115548 0.9960885 0.128204029
## 45   0.90 0.105352591 0.9974373 0.102789872
## 46   0.92 0.080713679 0.9981117 0.078825359
## 47   0.94 0.056074766 0.9986512 0.054725967
## 48   0.96 0.027187766 0.9994605 0.026648246
## 49   0.98 0.006796941 1.0000000 0.006796941
# Optimal cutoff of 0.22: the row with the largest Youden's index in the
# validation table above (the training table for full.model5 favoured 0.20).
# In the matrix printed below, columns are the actual classes and rows are the
# predicted classes (2060 of the 294 + 2060 actual 1s are predicted 1, matching
# the sensitivity of 0.875 listed for cutoff 0.22).
confusionMatrix(validate$over_50k, validate$p_hat, threshold =0.22)
##      0    1
## 0 5640  294
## 1 1774 2060
plotROC(validate$over_50k, validate$p_hat)

# Concordance 0.8982 (value printed by InformationValue::Concordance above).

full.model5 performed slightly better on the validation set, so I will take it to the test set as my final model.

Testing the model

#STEP 4: Test

# Perform all binning on the test set.
test <- test %>%
  mutate(
    # Capital gain / loss become 0/1 indicators (any non-zero amount -> 1).
    capital_gain = ifelse(capital_gain == 0, 0, 1),
    capital_loss = ifelse(capital_loss == 0, 0, 1),
    # Hours per week, in three steps that keep the intermediate columns:
    # under 40 -> 1 (this overwrites hours_week itself), over 40 -> 2,
    # exactly 40 -> 0.
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin),
    # Age groups (0,24], (24,44], (44,64], (64,140].
    age_bin = cut(age, breaks = c(0, 24, 44, 64, 140))
  )
head(test, 10)
##    id age        workclass education_level education_num     marital_status
## 2   2  50 Self-emp-not-inc       Bachelors            13 Married-civ-spouse
## 8   8  52 Self-emp-not-inc         HS-grad             9 Married-civ-spouse
## 12 12  30        State-gov       Bachelors            13 Married-civ-spouse
## 21 21  40          Private       Doctorate            16 Married-civ-spouse
## 22 22  54          Private         HS-grad             9          Separated
## 36 36  48          Private            11th             7      Never-married
## 37 37  21          Private    Some-college            10      Never-married
## 57 57  46          Private         5th-6th             3 Married-civ-spouse
## 61 61  30          Private       Bachelors            13 Married-civ-spouse
## 72 72  31          Private       Bachelors            13          Separated
##           occupation relationship               race    sex capital_gain
## 2    Exec-managerial      Husband              White   Male            0
## 8    Exec-managerial      Husband              White   Male            0
## 12    Prof-specialty      Husband Asian-Pac-Islander   Male            0
## 21    Prof-specialty      Husband              White   Male            0
## 22     Other-service    Unmarried              Black Female            0
## 36 Machine-op-inspct    Unmarried              White   Male            0
## 37 Machine-op-inspct    Own-child              White   Male            0
## 57 Machine-op-inspct      Husband              White   Male            0
## 61             Sales      Husband              White   Male            1
## 72             Sales    Own-child              Black Female            0
##    capital_loss hours_week       country over_50k hours_week_bin
## 2             0          1 United-States        0              1
## 8             0         45 United-States        1              2
## 12            0         40         India        1             40
## 21            0         60 United-States        1              2
## 22            0          1 United-States        0              1
## 36            0         40   Puerto-Rico        0             40
## 37            0         40 United-States        0             40
## 57            0         40        Mexico        0             40
## 61            0         40 United-States        0             40
## 72            0         40 United-States        0             40
##    hours_week_bins age_bin
## 2                1 (44,64]
## 8                2 (44,64]
## 12               0 (24,44]
## 21               2 (24,44]
## 22               1 (44,64]
## 36               0 (44,64]
## 37               0  (0,24]
## 57               0 (44,64]
## 61               0 (24,44]
## 72               0 (24,44]
# Fold sparse categories into broader ones.
test <- test %>%
  mutate(
    # NOTE(review): 'Loas' may be a misspelling of 'Laos'; confirm against the
    # country values actually present in the data.
    country = ifelse(
      country %in% c('Holand-Netherlands', 'Honduras', 'Loas'),
      '?',
      country
    ),
    education_level = ifelse(education_level == 'Preschool', '1st-4th', education_level),
    workclass = ifelse(workclass == 'Never-worked', '?', workclass)
  )

# Re-level education_level with "HS-grad" as the first level (presumably so it
# acts as the reference category in the models -- confirm against the fit).
test$education_level <- factor(
  test$education_level,
  levels = c(
    "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th",
    "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors",
    "Masters", "Prof-school", "Doctorate"
  )
)
count(test, education_level, sort = TRUE)
##    education_level    n
## 1          HS-grad 1592
## 2     Some-college 1085
## 3        Bachelors  808
## 4          Masters  275
## 5        Assoc-voc  186
## 6             11th  184
## 7       Assoc-acdm  166
## 8             10th  152
## 9      Prof-school   85
## 10         7th-8th   78
## 11             9th   73
## 12            12th   67
## 13         5th-6th   51
## 14       Doctorate   51
## 15         1st-4th   31
# Probability-based metrics for full.model5 on the test data.
test$p_hat <- predict(full.model5, newdata = test, type = "response")

# Predicted probabilities split by observed outcome.
p1 <- with(test, p_hat[over_50k == 1])
p0 <- with(test, p_hat[over_50k == 0])

# Coefficient of discrimination: mean p_hat of the 1s minus mean p_hat of the 0s.
coef_discrim <- mean(p1) - mean(p0)
coef_discrim
## [1] 0.4080256

# Overlaid densities of the predicted probability for the 0s and the 1s.
ggplot(test, aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )

# Concordant / discordant / tied pair percentages.
InformationValue::Concordance(test$over_50k, test$p_hat)
## $Concordance
## [1] 0.8921164
## 
## $Discordance
## [1] 0.1078836
## 
## $Tied
## [1] 4.163336e-17
## 
## $Pairs
## [1] 4388339
# $Concordance
# [1] 0.8921164
# 
# $Discordance
# [1] 0.1078836
# 
# $Tied
# [1] 4.163336e-17
# 
# $Pairs
# [1] 4388339
# Somers' D for the same actuals and scores; it equals the concordance minus
# the discordance printed above (0.8921164 - 0.1078836).
InformationValue::somersD(test$over_50k, test$p_hat)
## [1] 0.7842327
# 0.7842327


#Looking at Classification Based Metrics for Assessing Predictive Power:

library(InformationValue)

# Candidate classification cutoffs: 0.02, 0.04, ..., 0.98.
cutoff <- (1:49) / 50

# Sensitivity, specificity and Youden's index of test$p_hat at every cutoff.
# vapply() builds each vector in a single pass (instead of growing it with c()
# inside a loop) and verifies that every call returns exactly one number.
sens <- vapply(
  cutoff,
  function(th) sensitivity(test$over_50k, test$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) specificity(test$over_50k, test$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) youdensIndex(test$over_50k, test$p_hat, threshold = th),
  numeric(1)
)
ctable <- data.frame(cutoff, sens, spec, youden)

# Show the cutoffs ordered by Youden's index, best first.
print(ctable[order(-youden), ])
##    cutoff       sens      spec     youden
## 13   0.26 0.82982308 0.7968623 0.62668540
## 14   0.28 0.82139848 0.8049770 0.62637549
## 12   0.24 0.84077506 0.7803625 0.62113752
## 11   0.22 0.87363100 0.7473627 0.62099373
## 15   0.30 0.79865206 0.8187720 0.61742404
## 10   0.20 0.88205560 0.7346497 0.61670532
## 9    0.18 0.89132266 0.7176089 0.60893153
## 16   0.32 0.77000842 0.8360833 0.60609174
## 8    0.16 0.90227464 0.6935353 0.59580994
## 17   0.34 0.73546757 0.8555586 0.59102613
## 7    0.14 0.91912384 0.6667568 0.58588067
## 18   0.36 0.72030329 0.8636732 0.58397653
## 19   0.38 0.70513901 0.8744928 0.57963184
## 20   0.40 0.68828981 0.8858534 0.57414320
## 6    0.12 0.93428812 0.6375440 0.57183208
## 5    0.10 0.94692502 0.6069786 0.55390365
## 21   0.42 0.65627633 0.8955910 0.55186735
## 22   0.44 0.63352991 0.9015418 0.53507170
## 4    0.08 0.95956192 0.5685691 0.52813103
## 23   0.46 0.60825611 0.9069516 0.51520769
## 24   0.48 0.58887953 0.9142548 0.50313433
## 25   0.50 0.57203033 0.9231810 0.49521129
## 3    0.06 0.97304128 0.5139302 0.48697149
## 26   0.52 0.54844145 0.9318366 0.48027807
## 27   0.54 0.53580455 0.9367054 0.47250999
## 28   0.56 0.51811289 0.9421152 0.46022812
## 29   0.58 0.49873631 0.9464431 0.44517937
## 2    0.04 0.98567818 0.4441439 0.42982208
## 30   0.60 0.47346251 0.9515824 0.42504487
## 31   0.62 0.45661331 0.9559102 0.41252351
## 32   0.64 0.44060657 0.9607790 0.40138558
## 33   0.66 0.41617523 0.9664593 0.38263452
## 34   0.68 0.39090143 0.9715986 0.36250003
## 35   0.70 0.37320977 0.9740330 0.34724277
## 1    0.02 0.99578770 0.3226941 0.31848178
## 36   0.72 0.34035383 0.9775494 0.31790320
## 37   0.74 0.32013479 0.9791723 0.29930710
## 38   0.76 0.27042965 0.9853936 0.25582322
## 39   0.78 0.25105307 0.9878280 0.23888104
## 40   0.80 0.21903960 0.9894509 0.20849050
## 41   0.82 0.20556024 0.9910738 0.19663408
## 42   0.84 0.18028644 0.9937787 0.17406518
## 43   0.86 0.15080034 0.9959427 0.14674299
## 44   0.88 0.12636900 0.9964836 0.12285263
## 45   0.90 0.09856782 0.9972951 0.09586292
## 46   0.92 0.07329402 0.9983771 0.07167108
## 47   0.94 0.05054760 0.9989180 0.04946564
## 48   0.96 0.02443134 0.9994590 0.02389036
## 49   0.98 0.00758214 1.0000000 0.00758214
# Optimal cutoff of 0.26: the row with the largest Youden's index in the
# test-set table above.
# NOTE(review): this cutoff is selected on the test set itself, so the
# confusion matrix below is an optimistic estimate; consider applying the
# cutoff chosen on the validation set (0.22 for full.model5) instead.
# In the matrix printed below, columns are the actual classes and rows are the
# predicted classes (985 of the 202 + 985 actual 1s are predicted 1, matching
# the sensitivity of 0.830 listed for cutoff 0.26).
confusionMatrix(test$over_50k, test$p_hat, threshold =0.26)
##      0   1
## 0 2946 202
## 1  751 985
plotROC(test$over_50k, test$p_hat)

# Concordance 0.892 (value printed by InformationValue::Concordance above).

# Calculating the KS statistic - banks want to know this.
InformationValue::ks_stat(test$over_50k, test$p_hat)
## [1] 0.6217
# 0.6217

Reporting

# STEP 5: Group all data back together and run the final statistics.

# Perform on the whole dataset the binning that was worked out on train:
#   capital_gain / capital_loss -> 0/1 indicator of any non-zero amount
#   hours_week                  -> values below 40 collapsed to 1
#                                  (this overwrites the raw hours column)
#   hours_week_bin              -> hours_week with values above 40 set to 2
#   hours_week_bins             -> hours_week_bin with exactly 40 set to 0,
#                                  giving 0 = 40 hrs, 1 = under 40, 2 = over 40
#   age_bin                     -> (0,24], (24,44], (44,64], (64,140]
# mutate() evaluates its arguments in order, so every hours_* step works on
# the column produced by the step before it.
ovr50 <- ovr50 %>%
  mutate(
    capital_gain = as.numeric(capital_gain != 0),
    capital_loss = as.numeric(capital_loss != 0),
    hours_week = ifelse(hours_week < 40, 1, hours_week),
    hours_week_bin = ifelse(hours_week > 40, 2, hours_week),
    hours_week_bins = ifelse(hours_week_bin == 40, 0, hours_week_bin),
    age_bin = cut(age, breaks = c(0, 24, 44, 64, 140))
  )
head(ovr50, n = 10)
##    id age        workclass education_level education_num        marital_status
## 1   1  39        State-gov       Bachelors            13         Never-married
## 2   2  50 Self-emp-not-inc       Bachelors            13    Married-civ-spouse
## 3   3  38          Private         HS-grad             9              Divorced
## 4   4  53          Private            11th             7    Married-civ-spouse
## 5   5  28          Private       Bachelors            13    Married-civ-spouse
## 6   6  37          Private         Masters            14    Married-civ-spouse
## 7   7  49          Private             9th             5 Married-spouse-absent
## 8   8  52 Self-emp-not-inc         HS-grad             9    Married-civ-spouse
## 9   9  31          Private         Masters            14         Never-married
## 10 10  42          Private       Bachelors            13    Married-civ-spouse
##           occupation  relationship  race    sex capital_gain capital_loss
## 1       Adm-clerical Not-in-family White   Male            1            0
## 2    Exec-managerial       Husband White   Male            0            0
## 3  Handlers-cleaners Not-in-family White   Male            0            0
## 4  Handlers-cleaners       Husband Black   Male            0            0
## 5     Prof-specialty          Wife Black Female            0            0
## 6    Exec-managerial          Wife White Female            0            0
## 7      Other-service Not-in-family Black Female            0            0
## 8    Exec-managerial       Husband White   Male            0            0
## 9     Prof-specialty Not-in-family White Female            1            0
## 10   Exec-managerial       Husband White   Male            1            0
##    hours_week       country over_50k hours_week_bin hours_week_bins age_bin
## 1          40 United-States        0             40               0 (24,44]
## 2           1 United-States        0              1               1 (44,64]
## 3          40 United-States        0             40               0 (24,44]
## 4          40 United-States        0             40               0 (44,64]
## 5          40          Cuba        0             40               0 (24,44]
## 6          40 United-States        0             40               0 (24,44]
## 7           1       Jamaica        0              1               1 (44,64]
## 8          45 United-States        1              2               2 (44,64]
## 9          50 United-States        1              2               2 (24,44]
## 10         40 United-States        1             40               0 (24,44]
# Fold selected categories into broader ones on the whole dataset.

# Countries moved into the unknown ("?") group. The original code matched the
# spelling "Loas"; the UCI Adult data spells this value "Laos", so both
# spellings are accepted here to make sure the intended rows are recoded.
# NOTE(review): confirm the actual spelling with count(ovr50, country).
ovr50$country <- ifelse(
  ovr50$country %in% c("Holand-Netherlands", "Honduras", "Loas", "Laos"),
  "?",
  ovr50$country
)

# "Preschool" is merged into "1st-4th".
ovr50$education_level <- ifelse(
  ovr50$education_level == "Preschool",
  "1st-4th",
  ovr50$education_level
)

# "Never-worked" is merged into the unknown ("?") work class.
ovr50$workclass <- ifelse(
  ovr50$workclass == "Never-worked",
  "?",
  ovr50$workclass
)

# Re-level education_level so that "HS-grad" is the first level, which makes
# it the baseline category when the factor enters the model below.
# "Preschool" is still listed although those rows were recoded to "1st-4th"
# just above, so that level is empty.
ovr50$education_level <- factor(
  ovr50$education_level,
  levels = c(
    "HS-grad", "Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th",
    "11th", "12th", "Some-college", "Assoc-voc", "Assoc-acdm", "Bachelors",
    "Masters", "Prof-school", "Doctorate"
  )
)
count(ovr50, education_level, sort = TRUE)
##    education_level     n
## 1          HS-grad 15784
## 2     Some-college 10878
## 3        Bachelors  8025
## 4          Masters  2657
## 5        Assoc-voc  2061
## 6             11th  1812
## 7       Assoc-acdm  1601
## 8             10th  1389
## 9          7th-8th   955
## 10     Prof-school   834
## 11             9th   756
## 12            12th   657
## 13       Doctorate   594
## 14         5th-6th   509
## 15         1st-4th   330
# Fit the final model: a logistic regression of over_50k on the binned and
# categorical predictors, using every row of ovr50.
final.model <- glm(
  formula = over_50k ~ factor(age_bin) + factor(occupation) +
    factor(education_level) + factor(marital_status) + factor(race) +
    factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins),
  family = binomial(link = "logit"),
  data = ovr50
)
summary(final.model)
## 
## Call:
## glm(formula = over_50k ~ factor(age_bin) + factor(occupation) + 
##     factor(education_level) + factor(marital_status) + factor(race) + 
##     factor(capital_gain) + factor(capital_loss) + factor(hours_week_bins), 
##     family = binomial(link = "logit"), data = ovr50)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7626  -0.5328  -0.2053  -0.0362   3.8726  
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                 -6.108796   0.223086 -27.383
## factor(age_bin)(24,44]                       1.752303   0.112504  15.575
## factor(age_bin)(44,64]                       2.239972   0.114048  19.641
## factor(age_bin)(64,140]                      1.599827   0.133190  12.012
## factor(occupation)Adm-clerical               0.468820   0.093129   5.034
## factor(occupation)Armed-Forces               1.419920   0.831217   1.708
## factor(occupation)Craft-repair               0.383021   0.088722   4.317
## factor(occupation)Exec-managerial            1.168469   0.087257  13.391
## factor(occupation)Farming-fishing           -0.583563   0.122027  -4.782
## factor(occupation)Handlers-cleaners         -0.265439   0.127875  -2.076
## factor(occupation)Machine-op-inspct          0.005472   0.102991   0.053
## factor(occupation)Other-service             -0.444848   0.112546  -3.953
## factor(occupation)Priv-house-serv           -1.000828   0.602295  -1.662
## factor(occupation)Prof-specialty             0.907381   0.089081  10.186
## factor(occupation)Protective-serv            0.842676   0.115690   7.284
## factor(occupation)Sales                      0.667089   0.089640   7.442
## factor(occupation)Tech-support               0.965000   0.108496   8.894
## factor(occupation)Transport-moving           0.286301   0.100148   2.859
## factor(education_level)1st-4th              -1.749021   0.358902  -4.873
## factor(education_level)5th-6th              -1.159855   0.214554  -5.406
## factor(education_level)7th-8th              -1.269246   0.144657  -8.774
## factor(education_level)9th                  -1.139941   0.174496  -6.533
## factor(education_level)10th                 -0.785311   0.124140  -6.326
## factor(education_level)11th                 -0.717046   0.121074  -5.922
## factor(education_level)12th                 -0.334663   0.172864  -1.936
## factor(education_level)Some-college          0.373412   0.040309   9.264
## factor(education_level)Assoc-voc             0.460843   0.067014   6.877
## factor(education_level)Assoc-acdm            0.581448   0.076251   7.625
## factor(education_level)Bachelors             1.105687   0.043083  25.664
## factor(education_level)Masters               1.417558   0.061546  23.033
## factor(education_level)Prof-school           2.065969   0.106346  19.427
## factor(education_level)Doctorate             2.040379   0.121112  16.847
## factor(marital_status)Married-AF-spouse      2.642742   0.409673   6.451
## factor(marital_status)Married-civ-spouse     2.222323   0.049431  44.958
## factor(marital_status)Married-spouse-absent  0.248191   0.164290   1.511
## factor(marital_status)Never-married         -0.299685   0.063144  -4.746
## factor(marital_status)Separated             -0.064847   0.123659  -0.524
## factor(marital_status)Widowed                0.267414   0.114389   2.338
## factor(race)Asian-Pac-Islander               0.346505   0.187010   1.853
## factor(race)Black                            0.319316   0.178525   1.789
## factor(race)Other                            0.221406   0.254389   0.870
## factor(race)White                            0.501590   0.170236   2.946
## factor(capital_gain)1                        1.729645   0.045286  38.194
## factor(capital_loss)1                        1.123975   0.057223  19.642
## factor(hours_week_bins)1                    -0.629328   0.046423 -13.556
## factor(hours_week_bins)2                     0.437605   0.030863  14.179
##                                             Pr(>|z|)    
## (Intercept)                                  < 2e-16 ***
## factor(age_bin)(24,44]                       < 2e-16 ***
## factor(age_bin)(44,64]                       < 2e-16 ***
## factor(age_bin)(64,140]                      < 2e-16 ***
## factor(occupation)Adm-clerical              4.80e-07 ***
## factor(occupation)Armed-Forces               0.08759 .  
## factor(occupation)Craft-repair              1.58e-05 ***
## factor(occupation)Exec-managerial            < 2e-16 ***
## factor(occupation)Farming-fishing           1.73e-06 ***
## factor(occupation)Handlers-cleaners          0.03792 *  
## factor(occupation)Machine-op-inspct          0.95763    
## factor(occupation)Other-service             7.73e-05 ***
## factor(occupation)Priv-house-serv            0.09657 .  
## factor(occupation)Prof-specialty             < 2e-16 ***
## factor(occupation)Protective-serv           3.24e-13 ***
## factor(occupation)Sales                     9.93e-14 ***
## factor(occupation)Tech-support               < 2e-16 ***
## factor(occupation)Transport-moving           0.00425 ** 
## factor(education_level)1st-4th              1.10e-06 ***
## factor(education_level)5th-6th              6.45e-08 ***
## factor(education_level)7th-8th               < 2e-16 ***
## factor(education_level)9th                  6.46e-11 ***
## factor(education_level)10th                 2.52e-10 ***
## factor(education_level)11th                 3.17e-09 ***
## factor(education_level)12th                  0.05287 .  
## factor(education_level)Some-college          < 2e-16 ***
## factor(education_level)Assoc-voc            6.12e-12 ***
## factor(education_level)Assoc-acdm           2.43e-14 ***
## factor(education_level)Bachelors             < 2e-16 ***
## factor(education_level)Masters               < 2e-16 ***
## factor(education_level)Prof-school           < 2e-16 ***
## factor(education_level)Doctorate             < 2e-16 ***
## factor(marital_status)Married-AF-spouse     1.11e-10 ***
## factor(marital_status)Married-civ-spouse     < 2e-16 ***
## factor(marital_status)Married-spouse-absent  0.13087    
## factor(marital_status)Never-married         2.07e-06 ***
## factor(marital_status)Separated              0.60000    
## factor(marital_status)Widowed                0.01940 *  
## factor(race)Asian-Pac-Islander               0.06390 .  
## factor(race)Black                            0.07367 .  
## factor(race)Other                            0.38411    
## factor(race)White                            0.00321 ** 
## factor(capital_gain)1                        < 2e-16 ***
## factor(capital_loss)1                        < 2e-16 ***
## factor(hours_week_bins)1                     < 2e-16 ***
## factor(hours_week_bins)2                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 53751  on 48841  degrees of freedom
## Residual deviance: 32500  on 48796  degrees of freedom
## AIC: 32592
## 
## Number of Fisher Scoring iterations: 7
# Probability-based metrics for assessing predictive power.
# predict() is called without newdata, so p_hat holds the fitted
# probabilities for the same rows final.model was estimated on.
ovr50$p_hat <- predict(object = final.model, type = "response")

# Predicted probabilities split by the observed outcome.
p1 <- with(ovr50, p_hat[over_50k == 1])
p0 <- with(ovr50, p_hat[over_50k == 0])

# Coefficient of discrimination: mean predicted probability among the 1s
# minus the mean predicted probability among the 0s.
coef_discrim <- mean(p1) - mean(p0)
print(coef_discrim)
## [1] 0.4156532
#0.4156532


# Overlaid density curves of the predicted probability for the 0s and the 1s,
# titled with the coefficient of discrimination rounded to 3 decimals.
ggplot(data = ovr50, mapping = aes(x = p_hat, fill = factor(over_50k))) +
  geom_density(alpha = 0.7) +
  scale_fill_grey() +
  labs(
    x = "Predicted Probability",
    fill = "Outcome",
    title = paste0("Coefficient of Discrimination = ", round(coef_discrim, 3))
  )

# Concordant / discordant / tied pair shares for the full-data predictions.
InformationValue::Concordance(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat
)
## $Concordance
## [1] 0.8976143
## 
## $Discordance
## [1] 0.1023857
## 
## $Tied
## [1] 1.387779e-17
## 
## $Pairs
## [1] 434230485
# $Concordance
# [1] 0.8976143
# 
# $Discordance
# [1] 0.1023857
# 
# $Tied
# [1] 1.387779e-17
# 
# $Pairs
# [1] 434230485
# Somers' D for the full-data predictions.
InformationValue::somersD(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat
)
## [1] 0.7952286
# 0.7952286


# Classification-based metrics for assessing predictive power.
#
# For every probability cutoff 0.02, 0.04, ..., 0.98 compute sensitivity,
# specificity and Youden's index of the full-data predictions, then list the
# cutoffs from the highest Youden's index to the lowest.

library(InformationValue)

# Same 49 cutoffs the original loop produced as i / 50 for i = 1..49.
cutoff <- seq_len(49) / 50

# One metric value per cutoff; vapply() builds each vector in a single pass
# (no repeated c() appends) and enforces a numeric scalar per cutoff.
sens <- vapply(
  cutoff,
  function(th) sensitivity(ovr50$over_50k, ovr50$p_hat, threshold = th),
  numeric(1)
)
spec <- vapply(
  cutoff,
  function(th) specificity(ovr50$over_50k, ovr50$p_hat, threshold = th),
  numeric(1)
)
youden <- vapply(
  cutoff,
  function(th) youdensIndex(ovr50$over_50k, ovr50$p_hat, threshold = th),
  numeric(1)
)

ctable <- data.frame(cutoff, sens, spec, youden)

# Best cutoff (largest Youden's index) first.
print(ctable[order(-youden), ])
##    cutoff        sens      spec      youden
## 11   0.22 0.865234876 0.7678374 0.633072314
## 10   0.20 0.878583041 0.7526578 0.631240826
## 9    0.18 0.895610507 0.7325797 0.628190241
## 12   0.24 0.836057158 0.7920334 0.628090531
## 13   0.26 0.819543082 0.8051406 0.624683709
## 8    0.16 0.913493625 0.7077648 0.621258395
## 14   0.28 0.805681526 0.8150720 0.620753522
## 15   0.30 0.783947976 0.8307092 0.614657168
## 7    0.14 0.926756225 0.6856412 0.612397458
## 16   0.32 0.750492000 0.8526443 0.603136327
## 17   0.34 0.730812013 0.8643520 0.595164052
## 6    0.12 0.940104389 0.6532095 0.593313917
## 18   0.36 0.719346282 0.8709730 0.590319233
## 19   0.38 0.706682639 0.8777553 0.584437988
## 5    0.10 0.951912381 0.6202126 0.572125004
## 20   0.40 0.681013091 0.8898129 0.570826037
## 21   0.42 0.656284761 0.8996905 0.555975247
## 4    0.08 0.963891503 0.5828287 0.546720194
## 22   0.44 0.637802687 0.9063921 0.544194828
## 23   0.46 0.622657654 0.9124748 0.535132422
## 24   0.48 0.600325148 0.9205760 0.520901113
## 3    0.06 0.975956191 0.5305612 0.506517353
## 25   0.50 0.576281338 0.9291078 0.505389130
## 26   0.52 0.559596132 0.9337101 0.493306266
## 27   0.54 0.538547104 0.9385816 0.477128721
## 28   0.56 0.525797895 0.9425649 0.468362826
## 29   0.58 0.513048687 0.9461714 0.459220131
## 2    0.04 0.986138444 0.4591845 0.445322942
## 30   0.60 0.472747497 0.9549724 0.427719910
## 31   0.62 0.450842817 0.9598977 0.410740543
## 32   0.64 0.438435869 0.9630736 0.401509479
## 33   0.66 0.405236588 0.9683219 0.373558482
## 34   0.68 0.390861641 0.9706365 0.361498164
## 35   0.70 0.363052965 0.9753465 0.338399486
## 1    0.02 0.995465047 0.3352981 0.330763122
## 36   0.72 0.337554548 0.9783609 0.315915468
## 37   0.74 0.320698212 0.9802718 0.300970046
## 38   0.76 0.276375460 0.9856816 0.262057064
## 39   0.78 0.256353213 0.9875925 0.243945731
## 40   0.80 0.234961924 0.9897726 0.224734498
## 41   0.82 0.210661419 0.9915489 0.202210335
## 42   0.84 0.183708394 0.9938905 0.177598853
## 43   0.86 0.153332763 0.9958283 0.149161050
## 44   0.88 0.131342517 0.9967165 0.128058975
## 45   0.90 0.111662531 0.9974701 0.109132589
## 46   0.92 0.081286900 0.9984928 0.079779700
## 47   0.94 0.059553350 0.9990042 0.058557522
## 48   0.96 0.029947805 0.9996501 0.029597920
## 49   0.98 0.006845213 1.0000000 0.006845213
# Confusion matrix for the full-data predictions at 0.22, the cutoff with the
# highest Youden's index in the table above.
InformationValue::confusionMatrix(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat,
  threshold = 0.22
)
##       0     1
## 0 28529  1575
## 1  8626 10112
# ROC curve for the full-data predictions.
InformationValue::plotROC(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat
)

# Recorded concordance for the full data: 0.8975

# Kolmogorov-Smirnov statistic for the full data - banks want to know this.
InformationValue::ks_stat(
  actuals = ovr50$over_50k,
  predictedScores = ovr50$p_hat
)
## [1] 0.6305
# 0.6305

Calculating Odds Ratios for Reporting

# Odds ratios: exponentiate every coefficient of final.model and list them
# from smallest to largest. The "(Intercept)" row is exponentiated along with
# the rest, so that row is the baseline odds rather than a ratio.
oddsratios <- data.frame(
  Variable = names(coef(final.model)),
  Ratio = unname(exp(coef(final.model))),
  stringsAsFactors = FALSE
)
oddsratios <- arrange(oddsratios, Ratio)
oddsratios
##                                       Variable        Ratio
## 1                                  (Intercept)  0.002223227
## 2               factor(education_level)1st-4th  0.173944114
## 3               factor(education_level)7th-8th  0.281043398
## 4               factor(education_level)5th-6th  0.313531775
## 5                   factor(education_level)9th  0.319837880
## 6            factor(occupation)Priv-house-serv  0.367574861
## 7                  factor(education_level)10th  0.455977918
## 8                  factor(education_level)11th  0.488192369
## 9                     factor(hours_week_bins)1  0.532949992
## 10           factor(occupation)Farming-fishing  0.557907019
## 11             factor(occupation)Other-service  0.640921644
## 12                 factor(education_level)12th  0.715579177
## 13         factor(marital_status)Never-married  0.741051475
## 14         factor(occupation)Handlers-cleaners  0.766869395
## 15             factor(marital_status)Separated  0.937211302
## 16         factor(occupation)Machine-op-inspct  1.005486944
## 17                           factor(race)Other  1.247829458
## 18 factor(marital_status)Married-spouse-absent  1.281704470
## 19               factor(marital_status)Widowed  1.306581376
## 20          factor(occupation)Transport-moving  1.331493331
## 21                           factor(race)Black  1.376185963
## 22              factor(race)Asian-Pac-Islander  1.414116262
## 23         factor(education_level)Some-college  1.452682788
## 24              factor(occupation)Craft-repair  1.466708453
## 25                    factor(hours_week_bins)2  1.548992880
## 26            factor(education_level)Assoc-voc  1.585410491
## 27              factor(occupation)Adm-clerical  1.598107586
## 28                           factor(race)White  1.651344952
## 29           factor(education_level)Assoc-acdm  1.788626759
## 30                     factor(occupation)Sales  1.948555932
## 31           factor(occupation)Protective-serv  2.322573749
## 32            factor(occupation)Prof-specialty  2.477824173
## 33              factor(occupation)Tech-support  2.624786968
## 34            factor(education_level)Bachelors  3.021297893
## 35                       factor(capital_loss)1  3.077062475
## 36           factor(occupation)Exec-managerial  3.217064801
## 37              factor(education_level)Masters  4.127031845
## 38              factor(occupation)Armed-Forces  4.136788744
## 39                     factor(age_bin)(64,140]  4.952174612
## 40                       factor(capital_gain)1  5.638652842
## 41                      factor(age_bin)(24,44]  5.767868268
## 42            factor(education_level)Doctorate  7.693524781
## 43          factor(education_level)Prof-school  7.892944841
## 44    factor(marital_status)Married-civ-spouse  9.228739986
## 45                      factor(age_bin)(44,64]  9.393067421
## 46     factor(marital_status)Married-AF-spouse 14.051678624
# Sorted odds ratios as a plain vector (handy for copying into a report).
oddsratios[["Ratio"]]
##  [1]  0.002223227  0.173944114  0.281043398  0.313531775  0.319837880
##  [6]  0.367574861  0.455977918  0.488192369  0.532949992  0.557907019
## [11]  0.640921644  0.715579177  0.741051475  0.766869395  0.937211302
## [16]  1.005486944  1.247829458  1.281704470  1.306581376  1.331493331
## [21]  1.376185963  1.414116262  1.452682788  1.466708453  1.548992880
## [26]  1.585410491  1.598107586  1.651344952  1.788626759  1.948555932
## [31]  2.322573749  2.477824173  2.624786968  3.021297893  3.077062475
## [36]  3.217064801  4.127031845  4.136788744  4.952174612  5.638652842
## [41]  5.767868268  7.693524781  7.892944841  9.228739986  9.393067421
## [46] 14.051678624
# Term names in the same order as the sorted odds ratios.
oddsratios[["Variable"]]
##  [1] "(Intercept)"                                
##  [2] "factor(education_level)1st-4th"             
##  [3] "factor(education_level)7th-8th"             
##  [4] "factor(education_level)5th-6th"             
##  [5] "factor(education_level)9th"                 
##  [6] "factor(occupation)Priv-house-serv"          
##  [7] "factor(education_level)10th"                
##  [8] "factor(education_level)11th"                
##  [9] "factor(hours_week_bins)1"                   
## [10] "factor(occupation)Farming-fishing"          
## [11] "factor(occupation)Other-service"            
## [12] "factor(education_level)12th"                
## [13] "factor(marital_status)Never-married"        
## [14] "factor(occupation)Handlers-cleaners"        
## [15] "factor(marital_status)Separated"            
## [16] "factor(occupation)Machine-op-inspct"        
## [17] "factor(race)Other"                          
## [18] "factor(marital_status)Married-spouse-absent"
## [19] "factor(marital_status)Widowed"              
## [20] "factor(occupation)Transport-moving"         
## [21] "factor(race)Black"                          
## [22] "factor(race)Asian-Pac-Islander"             
## [23] "factor(education_level)Some-college"        
## [24] "factor(occupation)Craft-repair"             
## [25] "factor(hours_week_bins)2"                   
## [26] "factor(education_level)Assoc-voc"           
## [27] "factor(occupation)Adm-clerical"             
## [28] "factor(race)White"                          
## [29] "factor(education_level)Assoc-acdm"          
## [30] "factor(occupation)Sales"                    
## [31] "factor(occupation)Protective-serv"          
## [32] "factor(occupation)Prof-specialty"           
## [33] "factor(occupation)Tech-support"             
## [34] "factor(education_level)Bachelors"           
## [35] "factor(capital_loss)1"                      
## [36] "factor(occupation)Exec-managerial"          
## [37] "factor(education_level)Masters"             
## [38] "factor(occupation)Armed-Forces"             
## [39] "factor(age_bin)(64,140]"                    
## [40] "factor(capital_gain)1"                      
## [41] "factor(age_bin)(24,44]"                     
## [42] "factor(education_level)Doctorate"           
## [43] "factor(education_level)Prof-school"         
## [44] "factor(marital_status)Married-civ-spouse"   
## [45] "factor(age_bin)(44,64]"                     
## [46] "factor(marital_status)Married-AF-spouse"
# Order the model terms by significance (smallest p-value first).
# The coefficient matrix is read with its full name, `coefficients`, instead
# of relying on `$coef` partial matching.
mainEff <- as.data.frame(summary(final.model)$coefficients)
mainEff <- rownames_to_column(mainEff, "Variable")
colnames(mainEff) <- c("Variable", "Estimate", "Std_Error", "z_value", "p_val")
mainEff <- mainEff %>% arrange(p_val)
mainEff
##                                       Variable     Estimate  Std_Error
## 1     factor(marital_status)Married-civ-spouse  2.222322526 0.04943124
## 2                        factor(capital_gain)1  1.729645179 0.04528587
## 3                                  (Intercept) -6.108795567 0.22308641
## 4             factor(education_level)Bachelors  1.105686505 0.04308311
## 5               factor(education_level)Masters  1.417558467 0.06154584
## 6                        factor(capital_loss)1  1.123975400 0.05722270
## 7                       factor(age_bin)(44,64]  2.239971909 0.11404834
## 8           factor(education_level)Prof-school  2.065969302 0.10634554
## 9             factor(education_level)Doctorate  2.040379038 0.12111209
## 10                      factor(age_bin)(24,44]  1.752302561 0.11250396
## 11                    factor(hours_week_bins)2  0.437604965 0.03086272
## 12                    factor(hours_week_bins)1 -0.629327684 0.04642301
## 13           factor(occupation)Exec-managerial  1.168469391 0.08725712
## 14                     factor(age_bin)(64,140]  1.599826796 0.13318955
## 15            factor(occupation)Prof-specialty  0.907380826 0.08908104
## 16         factor(education_level)Some-college  0.373412045 0.04030909
## 17              factor(occupation)Tech-support  0.964999738 0.10849567
## 18              factor(education_level)7th-8th -1.269246182 0.14465664
## 19           factor(education_level)Assoc-acdm  0.581448151 0.07625129
## 20                     factor(occupation)Sales  0.667088551 0.08963993
## 21           factor(occupation)Protective-serv  0.842675946 0.11569025
## 22            factor(education_level)Assoc-voc  0.460843359 0.06701432
## 23                  factor(education_level)9th -1.139941036 0.17449648
## 24     factor(marital_status)Married-AF-spouse  2.642741864 0.40967258
## 25                 factor(education_level)10th -0.785310897 0.12413961
## 26                 factor(education_level)11th -0.717045752 0.12107368
## 27              factor(education_level)5th-6th -1.159854568 0.21455370
## 28              factor(occupation)Adm-clerical  0.468820170 0.09312920
## 29              factor(education_level)1st-4th -1.749021216 0.35890192
## 30           factor(occupation)Farming-fishing -0.583562964 0.12202681
## 31         factor(marital_status)Never-married -0.299685190 0.06314388
## 32              factor(occupation)Craft-repair  0.383020742 0.08872182
## 33             factor(occupation)Other-service -0.444848069 0.11254594
## 34                           factor(race)White  0.501590078 0.17023578
## 35          factor(occupation)Transport-moving  0.286301118 0.10014806
## 36               factor(marital_status)Widowed  0.267414089 0.11438926
## 37         factor(occupation)Handlers-cleaners -0.265438772 0.12787514
## 38                 factor(education_level)12th -0.334663027 0.17286426
## 39              factor(race)Asian-Pac-Islander  0.346504786 0.18701013
## 40                           factor(race)Black  0.319315878 0.17852482
## 41              factor(occupation)Armed-Forces  1.419919821 0.83121662
## 42           factor(occupation)Priv-house-serv -1.000828278 0.60229493
## 43 factor(marital_status)Married-spouse-absent  0.248190809 0.16429010
## 44                           factor(race)Other  0.221405608 0.25438851
## 45             factor(marital_status)Separated -0.064846513 0.12365907
## 46         factor(occupation)Machine-op-inspct  0.005471945 0.10299114
##         z_value         p_val
## 1   44.95785657  0.000000e+00
## 2   38.19392803  0.000000e+00
## 3  -27.38309114 4.360793e-165
## 4   25.66403570 2.947861e-145
## 5   23.03256284 2.199882e-117
## 6   19.64212469  6.751358e-86
## 7   19.64054879  6.964155e-86
## 8   19.42694812  4.567227e-84
## 9   16.84702987  1.103177e-63
## 10  15.57547423  1.068576e-54
## 11  14.17908023  1.234540e-45
## 12 -13.55637516  7.263796e-42
## 13  13.39110693  6.815731e-41
## 14  12.01165397  3.086103e-33
## 15  10.18601506  2.289578e-24
## 16   9.26371757  1.974356e-20
## 17   8.89436212  5.875581e-19
## 18  -8.77419914  1.721244e-18
## 19   7.62542057  2.432401e-14
## 20   7.44186830  9.927117e-14
## 21   7.28389786  3.243106e-13
## 22   6.87678912  6.121662e-12
## 23  -6.53274505  6.457502e-11
## 24   6.45086350  1.112146e-10
## 25  -6.32603005  2.515489e-10
## 26  -5.92239181  3.172926e-09
## 27  -5.40589395  6.448594e-08
## 28   5.03408370  4.801399e-07
## 29  -4.87325672  1.097734e-06
## 30  -4.78225189  1.733423e-06
## 31  -4.74606881  2.074082e-06
## 32   4.31709759  1.580943e-05
## 33  -3.95259109  7.730949e-05
## 34   2.94644341  3.214512e-03
## 35   2.85877854  4.252756e-03
## 36   2.33775526  1.939995e-02
## 37  -2.07576521  3.791567e-02
## 38  -1.93598736  5.286925e-02
## 39   1.85286642  6.390151e-02
## 40   1.78863579  7.367349e-02
## 41   1.70824281  8.759130e-02
## 42  -1.66169134  9.657468e-02
## 43   1.51068638  1.308684e-01
## 44   0.87034436  3.841122e-01
## 45  -0.52439757  6.000020e-01
## 46   0.05313025  9.576281e-01
# Sorted p-values as a plain vector.
mainEff[["p_val"]]
##  [1]  0.000000e+00  0.000000e+00 4.360793e-165 2.947861e-145 2.199882e-117
##  [6]  6.751358e-86  6.964155e-86  4.567227e-84  1.103177e-63  1.068576e-54
## [11]  1.234540e-45  7.263796e-42  6.815731e-41  3.086103e-33  2.289578e-24
## [16]  1.974356e-20  5.875581e-19  1.721244e-18  2.432401e-14  9.927117e-14
## [21]  3.243106e-13  6.121662e-12  6.457502e-11  1.112146e-10  2.515489e-10
## [26]  3.172926e-09  6.448594e-08  4.801399e-07  1.097734e-06  1.733423e-06
## [31]  2.074082e-06  1.580943e-05  7.730949e-05  3.214512e-03  4.252756e-03
## [36]  1.939995e-02  3.791567e-02  5.286925e-02  6.390151e-02  7.367349e-02
## [41]  8.759130e-02  9.657468e-02  1.308684e-01  3.841122e-01  6.000020e-01
## [46]  9.576281e-01
# Term names in the same order as the sorted p-values.
mainEff[["Variable"]]
##  [1] "factor(marital_status)Married-civ-spouse"   
##  [2] "factor(capital_gain)1"                      
##  [3] "(Intercept)"                                
##  [4] "factor(education_level)Bachelors"           
##  [5] "factor(education_level)Masters"             
##  [6] "factor(capital_loss)1"                      
##  [7] "factor(age_bin)(44,64]"                     
##  [8] "factor(education_level)Prof-school"         
##  [9] "factor(education_level)Doctorate"           
## [10] "factor(age_bin)(24,44]"                     
## [11] "factor(hours_week_bins)2"                   
## [12] "factor(hours_week_bins)1"                   
## [13] "factor(occupation)Exec-managerial"          
## [14] "factor(age_bin)(64,140]"                    
## [15] "factor(occupation)Prof-specialty"           
## [16] "factor(education_level)Some-college"        
## [17] "factor(occupation)Tech-support"             
## [18] "factor(education_level)7th-8th"             
## [19] "factor(education_level)Assoc-acdm"          
## [20] "factor(occupation)Sales"                    
## [21] "factor(occupation)Protective-serv"          
## [22] "factor(education_level)Assoc-voc"           
## [23] "factor(education_level)9th"                 
## [24] "factor(marital_status)Married-AF-spouse"    
## [25] "factor(education_level)10th"                
## [26] "factor(education_level)11th"                
## [27] "factor(education_level)5th-6th"             
## [28] "factor(occupation)Adm-clerical"             
## [29] "factor(education_level)1st-4th"             
## [30] "factor(occupation)Farming-fishing"          
## [31] "factor(marital_status)Never-married"        
## [32] "factor(occupation)Craft-repair"             
## [33] "factor(occupation)Other-service"            
## [34] "factor(race)White"                          
## [35] "factor(occupation)Transport-moving"         
## [36] "factor(marital_status)Widowed"              
## [37] "factor(occupation)Handlers-cleaners"        
## [38] "factor(education_level)12th"                
## [39] "factor(race)Asian-Pac-Islander"             
## [40] "factor(race)Black"                          
## [41] "factor(occupation)Armed-Forces"             
## [42] "factor(occupation)Priv-house-serv"          
## [43] "factor(marital_status)Married-spouse-absent"
## [44] "factor(race)Other"                          
## [45] "factor(marital_status)Separated"            
## [46] "factor(occupation)Machine-op-inspct"